threads <- 4
pairs <- list(
  "AR" = c("E2", "E2DHT"),
  "H3K27ac" = c("E2", "E2DHT")
)
library(tidyverse)
library(glue)
library(pander)
library(yaml)
library(reactable)
library(plyranges)
library(rtracklayer)
library(VennDiagram)
library(UpSetR)
library(magrittr)
library(scales)
library(ggrepel)
library(rlang)
library(ggside)
library(msigdbr)
library(htmltools)
library(goseq)
library(parallel)
library(GenomicInteractions)
library(extraChIPs)
library(ggraph)
library(tidygraph)
source(here::here("workflow", "scripts", "custom_functions.R"))
panderOptions("big.mark", ",")
panderOptions("missing", "")
panderOptions("table.split.table", Inf)
theme_set(
  theme_bw() +
    theme(
      plot.title = element_text(hjust = 0.5)
    )
)
config <- read_yaml(here::here("config", "config.yml"))
params <- read_yaml(here::here("config", "params.yml"))
targets <- names(pairs)
macs2_path <- here::here("output", "macs2", targets)
fdr_alpha <- config$comparisons$fdr
comps <- targets
if (length(unique(targets)) == 1) {
  comps <- seq_along(pairs) %>%
    vapply(
      function(x) paste(
        # names(pairs)[[x]], 
        paste(rev(pairs[[x]]), collapse = " Vs. ")#, 
        # sep = ": "
      ), 
      character(1)
    )
}
n_targets = length(unique(names(pairs)))
samples <- file.path(macs2_path, "qc_samples.tsv") %>% 
  lapply(read_tsv) %>% 
  lapply(mutate_all, as.character) %>% 
  bind_rows() %>% 
  dplyr::filter(
    qc == "pass",
    (target == names(pairs)[[1]] & treat %in% pairs[[1]]) |
      (target == names(pairs)[[2]] & treat %in% pairs[[2]])
  ) %>% 
  unite(label, target, label, remove = FALSE) %>% 
  mutate(
    target = factor(target, levels = unique(targets)),
    treat = factor(treat, levels = unique(unlist(pairs)))
  ) %>% 
  dplyr::select(-qc) %>% 
  droplevels()
stopifnot(nrow(samples) > 0)
rep_col <- setdiff(
  colnames(samples), c("sample", "treat", "target", "input", "label", "qc")
)
samples[[rep_col]] <- as.factor(samples[[rep_col]])
annotation_path <- here::here("output", "annotations")
colours <- read_rds(file.path(annotation_path, "colours.rds"))
treat_colours <- unlist(colours$treat[levels(samples$treat)])
combs <- paste(
  rep(comps, each = 4), c("Up", "Down", "Unchanged", "Undetected")
) %>% 
  combn(2) %>% 
  t() %>% 
  set_colnames(c("S1", "S2")) %>% 
  as_tibble() %>%
  mutate(
    C1 = str_remove(S1, " (Up|Down|Unchanged|Undetected)"),
    C2 = str_remove(S2, " (Up|Down|Unchanged|Undetected)"),
  ) %>%
  dplyr::filter(C1 != C2) %>%
  unite(combn, S1, S2, sep = " - ") %>%
  pull("combn")
direction_colours <- c(
  '#e6194B', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#42d4f4', '#f032e6', 
  '#fabed4', '#469990', '#dcbeff', '#9A6324', '#fffac8', '#800000', '#aaffc3', 
  '#000075', '#a9a9a9'
) %>% # https://sashamaps.net/docs/resources/20-colors/
  setNames(combs) %>% 
  c("Other" = "#4d4d4d")
comp_cols <- setNames(hcl.colors(3, "Zissou1", rev = TRUE), c(comps, "Both"))
cb <- config$genome$build %>% 
  str_to_lower() %>% 
  paste0(".cytobands") 
data(list = cb)
bands_df <- get(cb)
sq <- read_rds(file.path(annotation_path, "seqinfo.rds"))

all_gr <- file.path(annotation_path, "all_gr.rds") %>% 
  read_rds()
id2gene <- setNames(all_gr$gene$gene_name, all_gr$gene$gene_id)
trans_models <- file.path(annotation_path, "trans_models.rds") %>% 
  read_rds() 
genes_gr <- all_gr$gene
tss <- file.path(annotation_path, "tss.rds") %>% 
  read_rds()

blacklist <- here::here(annotation_path, "blacklist.bed.gz") %>%
  import.bed(seqinfo = sq) %>%
  sort()
external_features <- c()
has_features <- FALSE
if (!is.null(config$external$features)) {
  external_features <- suppressWarnings(
    import.gff(here::here(config$external$features), genome = sq)
  )
  keep_cols <- !vapply(
    mcols(external_features), function(x) all(is.na(x)), logical(1)
  )
  mcols(external_features) <- mcols(external_features)[keep_cols]
  has_features <- TRUE
  feature_colours <- colours$features %>% 
    setNames(str_sep_to_title(names(.)))
}
gene_regions <- read_rds(file.path(annotation_path, "gene_regions.rds"))
regions <- vapply(gene_regions, function(x) unique(x$region), character(1))
region_colours <-  unlist(colours$regions) %>% setNames(regions[names(.)])
rna_path <- here::here(config$external$rnaseq)
rnaseq <- tibble(gene_id = character(), gene_name = character())
if (length(rna_path) > 0) {
  stopifnot(file.exists(rna_path))
  if (str_detect(rna_path, "tsv$")) rnaseq <- read_tsv(rna_path)
  if (str_detect(rna_path, "csv$")) rnaseq <- read_csv(rna_path)
  if (!"gene_id" %in% colnames(rnaseq)) stop("Supplied RNA-Seq data must contain the column 'gene_id'")
  genes_gr <- subset(genes_gr, gene_id %in% rnaseq$gene_id)
}
has_rnaseq <- as.logical(nrow(rnaseq))
tx_col <- intersect(c("tx_id", "transcript_id"), colnames(rnaseq))
rna_gr_col <- ifelse(length(tx_col) > 0, "transcript_id", "gene_id")
rna_col <- c(tx_col, "gene_id")[[1]]
hic <- GInteractions()
hic_path <- here::here(config$external$hic)
has_hic <- FALSE
if (length(hic_path) > 0)
  if (file.exists(hic_path)) {
    has_hic <- TRUE
    hic <- makeGenomicInteractionsFromFile(hic_path, type = "bedpe")
    reg_combs <- expand.grid(regions, regions) %>% 
      as.matrix() %>% 
      apply(
        MARGIN = 1, 
        function(x) {
          x <- sort(factor(x, levels = regions))
          paste(as.character(x), collapse = " - ")
        }
      ) %>% 
      unique()
    hic$regions <- anchors(hic) %>% 
      vapply(
        bestOverlap,
        y = GRangesList(lapply(gene_regions, granges)),
        character(length(hic))
      ) %>% 
      apply(MARGIN = 2, function(x) regions[x]) %>% 
      apply(
        MARGIN = 1, 
        function(x) {
          x <- sort(factor(x, levels = regions))
          paste(as.character(x), collapse = " - ")
        }
      ) %>% 
      factor(levels = reg_combs) %>%
      fct_relabel(
        str_replace_all,
        pattern = "Promoter \\([0-9kbp/\\+-]+\\)", replacement = "Promoter"
      )
    if (has_features) {
      feat_combs <- expand.grid(names(feature_colours), names(feature_colours)) %>% 
        as.matrix() %>% 
        apply(
          MARGIN = 1, 
          function(x) {
            x <- sort(factor(x, levels = names(feature_colours)))
            paste(as.character(x), collapse = " - ")
          }
        ) %>% 
        unique()
      hic$features <- vapply(
        anchors(hic),
        function(x) bestOverlap(
          x, external_features, var = "feature", missing = "no_feature"
        ),
        character(length(hic))
      )  %>% 
        apply(
          MARGIN = 1, 
          function(x) {
            x <- sort(factor(x, levels = names(feature_colours)))
            paste(as.character(x), collapse = " - ")
          }
        ) %>% 
        factor(levels = feat_combs) %>%
        fct_relabel(str_sep_to_title, pattern = "_")
    }
  }
stopifnot(is(hic, "GInteractions"))
seqlevels(hic) <- seqlevels(sq)
seqinfo(hic) <- sq
hic <- hic[!overlapsAny(hic, blacklist)]
oracle_peaks <- file.path(macs2_path, "oracle_peaks.rds") %>%
  unique() %>% 
  sapply(read_rds) %>% 
  setNames(unique(basename(macs2_path)))
consensus_peaks <- file.path(macs2_path, "consensus_peaks.bed") %>%
  lapply(import.bed, seqinfo = sq) %>%
  setNames(basename(macs2_path))
all_consensus <- consensus_peaks %>% 
  GRangesList() %>% 
  unlist() %>% 
  GenomicRanges::reduce() %>% 
  sort() 
db_results <- seq_along(pairs) %>% 
  lapply(
    function(x) {
      tg <- names(pairs)[[x]]
      here::here(
        "output", "differential_binding", tg, 
        glue("{tg}_{pairs[[x]][[1]]}_{pairs[[x]][[2]]}-differential_binding.rds")
      )
    }
  ) %>% 
  lapply(read_rds) %>% 
  lapply(mutate, fdr_mu0 = p.adjust(p_mu0, "fdr")) %>% 
  setNames(comps)
fdr_column <- ifelse(
  "fdr_ihw" %in% colnames(mcols(db_results[[1]])), 
  "fdr_ihw", 
  str_subset("fdr", colnames(mcols(db_results[[1]])))
)
up_col <- function(x) {
  if (is.na(x) | is.nan(x)) return("#ffffff")
  rgb(
    colorRamp(c("#ffffff", colours$direction[["up"]]))(x), maxColorValue = 255
  )
}
down_col <- function(x) {
  if (is.na(x) | is.nan(x)) return("#ffffff")
  rgb(
    colorRamp(c("#ffffff", colours$direction[["down"]]))(x), maxColorValue = 255
  )
}
unch_col <- function(x) {
  if (is.na(x) | is.nan(x)) return("#ffffff")
  rgb(
    colorRamp(c("#ffffff", colours$direction[["unchanged"]]))(x), 
    maxColorValue = 255
  )
}
lfc_col <- function(x){
  if (is.na(x) | is.nan(x)) return("#ffffff")
  rgb(
    colorRamp(c(colours$direction[["down"]], "#ffffff", colours$direction[["up"]]))(x), 
    maxColorValue = 255
  )
}
expr_col <- function(x){
  if (is.na(x) | is.nan(x)) return("#ffffff")
  rgb(colorRamp(hcl.colors(9, "TealRose"))(x), maxColorValue = 255)
}
bar_style <- function(width = 1, fill = "#e6e6e6", height = "75%", align = c("left", "right"), color = NULL) {
  align <- match.arg(align)
  if (align == "left") {
    position <- paste0(width * 100, "%")
    image <- sprintf("linear-gradient(90deg, %1$s %2$s, transparent %2$s)", fill, position)
  } else {
    position <- paste0(100 - width * 100, "%")
    image <- sprintf("linear-gradient(90deg, transparent %1$s, %2$s %1$s)", position, fill)
  }
  list(
    backgroundImage = image,
    backgroundSize = paste("100%", height),
    backgroundRepeat = "no-repeat",
    backgroundPosition = "center",
    color = color
  )
}
with_tooltip <- function(value, width = 30) {
  htmltools::tags$span(title = value, str_trunc(value, width))
}
comp_name <- glue(
  "{targets[[1]]}_{pairs[[1]][[1]]}_{pairs[[1]][[2]]}-", 
  "{targets[[2]]}_{pairs[[2]][[1]]}_{pairs[[2]][[2]]}"
)
fig_path <- here::here("docs", "assets", comp_name)
if (!dir.exists(fig_path)) dir.create(fig_path, recursive = TRUE)
fig_dev <- knitr::opts_chunk$get("dev")
fig_type <- fig_dev[[1]]
if (is.null(fig_type)) stop("Couldn't detect figure type")
fig_fun <- match.fun(fig_type)
if (fig_type %in% c("bmp", "jpeg", "png", "tiff")) {
  ## These figure types require dpi & resetting to be inches
  formals(fig_fun)$units <- "in"
  formals(fig_fun)$res <- 300
}

out_path <- here::here(
  "output", "pairwise_comparisons", paste(targets, collapse = "_")
)
if (!dir.exists(out_path)) dir.create(out_path, recursive = TRUE)
all_out <- list(
  csv = file.path(
    out_path, glue(comp_name, "-pairwise_comparison.csv.gz"
    )
  ),
  de_genes = file.path(
    out_path, 
    glue("{comp_name}-de_genes.csv")
  ),
  goseq = file.path(
    out_path,
    glue("{comp_name}-enrichment.csv")
  ),
  results = file.path(
    out_path, glue(comp_name, "-all_windows.rds")
  ),
  rna_enrichment = file.path(
    out_path,
    glue("{comp_name}-rnaseq_enrichment.csv")
  ),
  renv = here::here(
    "output", "envs",
    glue(comp_name, "-pairwise_comparison.RData")
  )
)
## Empty objects for when RNA-Seq is absent
de_genes_both_comps <- tibble()
cmn_res <- list(tibble(gs_name = character(), leadingEdge = list()))

Outline

This stage of the GRAVI workflow takes the results from two individual differential binding analyses and compares the results, by finding peaks which directly overlap and considering the joint behaviour. In this specific analyses the differential binding response for AR when comparing E2DHT Vs E2, will be integrated with the differential binding response for H3K27ac comparing E2DHT Vs E2. Data will first be described from the perspective of consensus peaks and oracle peaks, before moving onto the differentially bound windows obtained in previous steps.

Integration Of Differential Binding Results

The initial steps of differential binding analysis detect regions with clear changes in binding patterns, however the integration of datasets instead focusses on the correct classification of a given range/window. Differential binding analyses generally focus on controlling the Type II errors (i.e. false discoveries), which comes with a commensurate increase in Type I errors (false negatives). When seeking to correctly classify a window across two datasets, the detection as changed can be taken as a robust finding, whilst the consideration as unchanged may be less robust. When comparing a window across two datasets, a detection as changed within one dataset is taken as primary evidence of ‘something interesting’ occurring, with the secondary consideration being the correct description of the combined binding patterns. For this reason, and to minimise issues with hard thresholding, once a window is considered to be of interest, the p-value used in the second comparison is that obtained (and FDR-adjusted) using the test for \(H_0: \mu = 0\) instead of the initial \(H_0: 0 < |\mu| < \lambda\). The insistence on the more stringent initial inclusion criteria still protects against false discoveries, but this approach instead allows improved classification across both comparisons.

For classification steps within each window, each target is given the status 1) Up, 2) Down, 3) Unchanged, or 4) Undetected. Across two comparisons, this gives 15 possible classifications for each region found with at least one target bound (e.g. Up-Up, Up-Down, Up-Unchanged, Up-Undetected etc.), given that any potential Undetected-Undetected windows will not be included for obvious reasons.

Enrichment Testing

After direct comparison of binding patterns, enrichment analysis is performed in a similar manner to for individual analyses. If RNA-Seq data is provided, genes will be restricted to those present within the RNA-Seq dataset, as representative of detected genes. Enrichment testing is performed on:

  1. All genes mapped to any bound region against those not mapped to a bound region
  2. All genes mapped to both targets against genes not mapped

Comparison To RNA-Seq

If RNA-Seq data is supplied, the sets of genes associated with all binding patterns (e.g. Up-Up, Up-Down etc) were treated as gene-sets and Gene Set Enrichment Analysis (GSEA) (Subramanian et al. 2005) was used to determine enrichment and leading-edge genes associated with altered expression. Genes will be ranked to incorporate direction of change (i.e. Directional GSEA) or significance only (i.e Non-Directional GSEA). The results from enrichment analysis across ChIP target and RNA-Seq data will also be combined to reveal pathways under the regulatory influence in either comparison, for which we also have evidence of changed gene expression.

Peak Comparison

Data was loaded for treatment-agnostic consensus peaks for AR and H3K27ac with treatment-specific oracle peaks (i.e. based on reproducibility across treatment-specific replicates). The sets of macs2-derived peaks for both AR and H3K27ac were first compared using treatment-agnostic peaks for each target. The sets of treatment-specific oracle-peaks were then compared across the combined treatment groups and targets.

Consensus Peaks

fig_name <- targets %>% 
  paste(collapse = "_") %>% 
  paste0("_common_peaks.", fig_type)
lapply(
  consensus_peaks,
  function(x) as.character(subsetByOverlaps(all_consensus, x))
)%>% 
  venn.diagram(
    file.path(fig_path, fig_name),
    imagetype = fig_type,
    units = "in",
    cat.cex = 1.4,
    height = 9, 
    width = 10,
    fill = comp_cols[comps], 
    alpha = 0.3
  )
file.remove(list.files(fig_path, pattern = "log$", full.names = TRUE))

Union of peaks found in AR and H3K27ac, and which of the two targets each peak is found in. Peaks are based on the union of both sets of consensus peaks, which are themselves independent of treatment group.

Treatment Specific Peaks

bar_cols <- treat_colours[rev(unlist(pairs))]
if (length(unique(targets)) == 1)
  bar_cols <- treat_colours[
    pairs %>% 
      unlist() %>% 
      unique() %>% 
      rev()
  ]
samples %>% 
  distinct(target, treat) %>% 
  mutate_all(as.character) %>% 
  unite(group, target, treat, remove = FALSE) %>% 
  split(.$group) %>% 
  lapply(
    function(x) {
      gr <- subsetByOverlaps(all_consensus, oracle_peaks[[x$target]][[x$treat]])
      as.character(gr)
    }
  ) %>% 
  setNames(
    str_replace_all(names(.), "(.+)_(.+)", "\\1 (\\2)")
  ) %>% 
  fromList() %>% 
  upset(
    sets = samples %>%
      distinct(target, treat) %>%
      mutate_all(as.character) %>%
      mutate(grp = glue("{target} ({treat})")) %>%
      pull("grp") %>%
      as.character() %>%
      rev(),
    keep.order = TRUE,
    order.by = "freq",
    set_size.show = TRUE, 
    set_size.scale_max = oracle_peaks %>% 
      lapply(function(x) lapply(x, length)) %>% 
      unlist() %>% 
      max() %>% 
      multiply_by(1.15),
    sets.bar.color = bar_cols
  )
*Macs2 peaks detected for AR and H3K27ac in the treatment groups E2 or E2DHT.*

Macs2 peaks detected for AR and H3K27ac in the treatment groups E2 or E2DHT.

Differentially Bound Windows

lambda <- log2(config$comparisons$fc)
c1_status <- glue("{comps[[1]]}_status")
c1_fdr <- glue("{comps[[1]]}_fdr")
c1_logfc <- glue("{comps[[1]]}_logFC")
c1_logcpm <- glue("{comps[[1]]}_AveExpr")
c1_centre <- glue("{comps[[1]]}_centre")
c2_status <- glue("{comps[[2]]}_status")
c2_fdr <- glue("{comps[[2]]}_fdr")
c2_logfc <- glue("{comps[[2]]}_logFC")
c2_logcpm <- glue("{comps[[2]]}_AveExpr")
c2_centre <- glue("{comps[[2]]}_centre")
all_windows <- db_results %>%
  lapply(granges) %>% 
  GRangesList() %>% 
  unlist()%>%
  sort() %>% 
  GenomicRanges::reduce() %>% 
  mutate(
    region = regions[
      bestOverlap(., GRangesList(lapply(gene_regions, granges)))
    ] %>% 
      factor(levels = regions)
  ) %>%
  join_overlap_left(
    db_results[[comps[[1]]]] %>% 
      mutate(
        status = as.character(status),
        peak_centre = start(
          resize(keyval_range, width = 1, fix = "center")
        )
      ) %>% 
      select(
        !!sym(c1_logcpm) := AveExpr, 
        !!sym(c1_logfc) := logFC, 
        !!sym(c1_status) := status, 
        !!sym(c1_fdr) := !!sym(fdr_column),
        !!sym(glue("{comps[[1]]}_fdr_mu0")) := fdr_mu0,
        !!sym(glue("{comps[[1]]}_centre")) := peak_centre
      )
  ) %>%
  join_overlap_left(
    db_results[[comps[[2]]]] %>% 
      mutate(
        status = as.character(status),
        peak_centre = start(
          resize(keyval_range, width = 1, fix = "center")
        )
      ) %>% 
      select(
        !!sym(c2_logcpm) := AveExpr, 
        !!sym(c2_logfc) := logFC, 
        !!sym(c2_status) := status,
        !!sym(c2_fdr) := !!sym(fdr_column),
        !!sym(glue("{comps[[2]]}_fdr_mu0")) := fdr_mu0,
        !!sym(glue("{comps[[2]]}_centre")) := peak_centre
      )
  ) %>% 
  ## Remove duplicate mappings to each range, just selecting the one with the
  ## lowest FDR
  arrange(!!sym(c2_fdr)) %>% 
  distinctMC(!!sym(c1_fdr), .keep_all = TRUE) %>% 
  arrange(!!sym(c1_fdr)) %>% 
  sort() 
## plyranges::mutate changes the column names using `make.names()`. Avoid!!
## Reclassify windows from the first comparison using data from the second
## to reduce false negatives. If the 2nd comparison contains a significant
## change in binding, reclassify the first using the fdr for H0: mu[0] = 0
## but only where |logFC[est]| > lambda
switch_up <- mcols(all_windows)[[c2_fdr]] < fdr_alpha &
  mcols(all_windows)[[glue("{comps[[1]]}_fdr_mu0")]] < fdr_alpha &
  mcols(all_windows)[[c1_logfc]] > lambda
mcols(all_windows)[[c1_status]][switch_up] <- "Up"
switch_down <- mcols(all_windows)[[c2_fdr]] < fdr_alpha &
  mcols(all_windows)[[glue("{comps[[1]]}_fdr_mu0")]] < fdr_alpha &
  mcols(all_windows)[[c1_logfc]] < (-1) * lambda
mcols(all_windows)[[c1_status]][switch_down] <- "Down"
mcols(all_windows)[[c1_status]] <- mcols(all_windows) %>% 
  .[[c1_status]] %>% 
  factor(levels = str_sep_to_title(names(colours$direction))) %>% 
  fct_explicit_na("Undetected") %>% 
  fct_relabel(function(x) paste(comps[[1]], x))  
## Now reclassify the second comparison using changed windows in the first
switch_up <- mcols(all_windows)[[c1_fdr]] < fdr_alpha &
  mcols(all_windows)[[glue("{comps[[2]]}_fdr_mu0")]] < fdr_alpha &
  mcols(all_windows)[[c2_logfc]] > lambda
mcols(all_windows)[[c2_status]][switch_up] <- "Up"
switch_down <- mcols(all_windows)[[c1_fdr]] < fdr_alpha &
  mcols(all_windows)[[glue("{comps[[2]]}_fdr_mu0")]] < fdr_alpha &
  mcols(all_windows)[[c2_logfc]] < (-1) * lambda
mcols(all_windows)[[c2_status]][switch_down] <- "Down"
mcols(all_windows)[[c2_status]] <- mcols(all_windows) %>% 
  .[[c2_status]] %>% 
  factor(levels = str_sep_to_title(names(colours$direction))) %>% 
  fct_explicit_na("Undetected") %>% 
  fct_relabel(function(x) paste(comps[[2]], x))  
mcols(all_windows)[["status"]] <- paste(
  mcols(all_windows)[[c1_status]],
  mcols(all_windows)[[c2_status]],
  sep = " - "
) %>% 
  factor(levels = combs) %>% 
  droplevels()
mcols(all_windows)[["distance"]] <- abs(
  mcols(all_windows)[[glue("{comps[[1]]}_centre")]] - mcols(all_windows)[[glue("{comps[[2]]}_centre")]]
)
## Remove any with an 'ambiguous' status
all_windows <- subset(all_windows, !is.na(status))

if (has_features) 
  all_windows$feature <- bestOverlap(
    all_windows, external_features, var = "feature", missing = "no_feature"
  ) %>% 
  str_sep_to_title()

all_windows$hic <- NA
if (has_hic)
  all_windows$hic <- overlapsAny(all_windows, hic)

## Define promoters for mapping
feat_prom <- feat_enh <- GRanges()
if (has_features) {
  feat_prom <- ifelse(
    "feature" %in% colnames(mcols(external_features)), 
    list(granges(subset(external_features, str_detect(feature, "[Pp]rom")))), 
    list(GRanges())
  )[[1]]
  feat_enh <- ifelse(
    "feature" %in% colnames(mcols(external_features)), 
    list(granges(subset(external_features, str_detect(feature, "[Ee]nhanc")))), 
    list(GRanges())
  )[[1]]
}
prom4mapping <- GRangesList(feat_prom, granges(gene_regions$promoters)) %>%
  unlist() %>%
  GenomicRanges::reduce()

all_windows <- all_windows %>% 
  select(any_of(c("region", "feature", "hic")), everything()) %>% 
  mapByFeature(
    all_gr$gene, prom = prom4mapping, enh = feat_enh, gi = hic,
    gr2gene = params$mapping$gr2gene, 
    prom2gene = params$mapping$prom2gene,
    enh2gene = params$mapping$enh2gene, gi2gene = params$mapping$gi2gene
  ) 
all_windows$detected <- all_windows$gene_name
if (has_rnaseq) all_windows$detected <- endoapply(
  all_windows$detected, intersect, rnaseq$gene_name
) 
n_mapped <- all_windows %>%
  select(status, gene_id) %>% 
  as_tibble() %>% 
  unnest(everything()) %>% 
  distinct(status, gene_id) %>% 
  group_by(status) %>% 
  summarise(mapped = dplyr::n()) %>% 
  arrange(desc(mapped)) 

A set of 51,077 common windows was formed by obtaining the union of windows produced during the analysis of AR and H3K27ac individually. These common windows were formed using any overlap between target-specific windows. The window-to-gene mappings and the best overlap with genomic regions was also reformed.

In the original datasets, 17,043 windows were included for AR, whilst 41,089 windows were included for H3K27ac. The distance between each pair of windows was found by taking the centre of the original sliding window used for statistical analysis, and finding the difference. As windows were detected in an unstranded manner, all distance values were set to be positive. Taking the complete set of 51,077 windows, 6,888 (13.5%) were considered as being detected in both datasets. The median distance between windows found in both comparisons was 400bp. 19 peaks (0.3%) were found to directly overlap.

Merged windows were compared across both targets and in order to obtain the best classification for each window, the initial values of an FDR-adjusted p-value < 0.05 from either comparison were used to consider a window as showing changed binding in at least one comparison. A secondary step was then introduced for each window such that if one target was significant and the other target had 1) received an FDR-adjusted p-value < 0.05 using a point-based \(H_0\), and 2) had an estimated logFC beyond the range specified for the range-based null hypothesis (\(H_0: 0 < |\mu| < \lambda\) for \(\lambda = 0.26\)), these windows were then considered significant in the second comparison. This was to help reduce the number of windows incorrectly classified as unchanged whilst the alternative ChIP target was defined as changed.

Whilst summary tables and figures are presented below, the full set of windows and mapped genes were exported as output/pairwise_comparisons/AR_H3K27ac/AR_E2_E2DHT-H3K27ac_E2_E2DHT-pairwise_comparison.csv.gz

cp <- htmltools::tags$em(
  glue(
    "Summary of combined binding windows for ",
    glue_collapse(comps, sep = " and "),
    ". Windows were classified as described above. ",
    "The number of windows showing each combined pattern are given, along ",
    "with the number of unique genes mapped to each set of combined windows, ",
    "noting that some genes will be mapped to multiple sets of windows. ",
    "The % of all genes with a window mapped from either target is given in ",
    "the final column."
  )
)
tbl <- all_windows %>% 
  as_tibble() %>% 
  rename_all(
    str_replace_all, pattern = "\\.Vs\\.\\.", replacement = " Vs. "
  ) %>% 
  rename_all(
    str_replace_all, pattern = "\\.\\.", replacement = ": "
  ) %>% 
  group_by(across(ends_with("_status"))) %>% 
  summarise(
    windows = dplyr::n(),
    mapped = length(unique(unlist(gene_id))),
    .groups = "drop"
  ) %>% 
  mutate(
    `% genes` = mapped / length(unique(unlist(all_windows$gene_id)))
  ) %>% 
  reactable(
    filterable = TRUE,
    showPageSizeOptions = TRUE,
    columns = list2(
      !!sym(c1_status) := colDef(
        name = comps[[1]],
        cell = function(value) {
          value <- str_extract(value, "Up|Down|Unchanged|Undetected")
          if (str_detect(value, "Up")) value <- paste(value, "\u21E7")
          if (str_detect(value, "Down")) value <- paste(value, "\u21E9")
          paste(targets[[1]], value)
        },
        style = function(value) {
          colour <- case_when(
            str_detect(value, "Up") ~ colours$direction[["up"]],
            str_detect(value, "Down") ~ colours$direction[["down"]],
            str_detect(value, "Unchanged") ~ colours$direction[["unchanged"]],
            str_detect(value, "Undetected") ~ colours$direction[["undetected"]]
          )
          list(color = colour)
        },
        footer = htmltools::tags$b("Total")
      ),
      !!sym(c2_status) := colDef(
        name = comps[[2]],
        cell = function(value) {
          value <- str_extract(value, "Up|Down|Unchanged|Undetected")
          if (str_detect(value, "Up")) value <- paste(value, "\u21E7")
          if (str_detect(value, "Down")) value <- paste(value, "\u21E9")
          paste(targets[[2]], value)
        },
        style = function(value) {
          colour <- case_when(
            str_detect(value, "Up") ~ colours$direction[["up"]],
            str_detect(value, "Down") ~ colours$direction[["down"]],
            str_detect(value, "Unchanged") ~ colours$direction[["unchanged"]],
            str_detect(value, "Undetected") ~ colours$direction[["undetected"]]
          )
          list(color = colour)
        }
      ),
      windows = colDef(
        name = "Number of Windows", 
        cell = function(value) comma(value, 1),
        style = function(value) {
          bar_style(width = 0.9*value / length(all_windows), fill = "#B3B3B3")
        },
        footer = htmltools::tags$b(comma(length(all_windows), 1))
      ),
      mapped = colDef(
        name = "Mapped Genes", 
        cell = function(value) comma(value, 1),
        style = function(value) {
          bar_style(width = 0.9*value /length(unique(unlist(all_windows$gene_id))), fill = "#B3B3B3")
        },
        footer = htmltools::tags$b(comma(length(unique(unlist(all_windows$gene_id))), 1))
      ),
      `% genes` = colDef(
        name = "% Of All Mapped Genes",
        cell = function(x) percent(x, 0.1),
        style = function(value) {
          bar_style(width = 0.9*value, fill = "#B3B3B3")
        }
      )
    )
  )
div(class = "table",
    div(class = "table-header",
        div(class = "caption", cp),
        tbl
    )
)
Summary of combined binding windows for AR and H3K27ac. Windows were classified as described above. The number of windows showing each combined pattern are given, along with the number of unique genes mapped to each set of combined windows, noting that some genes will be mapped to multiple sets of windows. The % of all genes with a window mapped from either target is given in the final column.

Pairwise Differential Binding

df <- all_windows %>% 
  as_tibble() %>% 
  rename_all(
    str_replace_all, pattern = "\\.Vs\\.\\.", replacement = " Vs. "
  ) %>% 
  rename_all(
    str_replace_all, pattern = "\\.\\.", replacement = ": "
  ) %>% 
  dplyr::filter(!str_detect(status, "Undetected")) %>% 
  droplevels()
df %>% 
  ggplot(
    aes(
      !!sym(c1_logfc), !!sym(c2_logfc),
      colour = status
    )
  ) +
  geom_point(alpha = 0.6) +
  geom_hline(yintercept = 0) +
  geom_hline(
    yintercept = c(1, -1) * lambda, 
    linetype = 2, colour = "grey"
  ) +
  geom_vline(xintercept = 0) +
  geom_vline(
    xintercept = c(1, -1)* lambda,
    linetype = 2, colour = "grey"
  ) +
  geom_text_repel(
    aes(label = lab),
    data = . %>% 
      dplyr::filter(
        !str_detect(status, "Unchanged.+Unchanged"),
        vapply(detected, length, integer(1)) > 0
      ) %>% 
      mutate(
        total_lfc = abs(!!sym(c1_logfc)) + abs(!!sym(c2_logfc)) 
      ) %>% 
      group_by(status) %>%
      dplyr::filter(total_lfc == max(total_lfc)) %>% 
      mutate(
        detected = vapply(detected, paste, character(1), collapse = "; "),
        lab = paste(status, detected, sep = ": ") %>% str_wrap(25)
      ),
    size = 4, 
    bg.r = 0.001, bg.color = "grey70",
    show.legend = FALSE
  ) +
  geom_text(
    aes(x, y, label = lab),
    data = . %>%
      mutate(
        x = 0.9*max((!!sym(c1_logfc))),
        y = min(!!sym(c2_logfc))
      ) %>% 
      summarise(
        x = unique(x), 
        y = unique(y),
        n = dplyr::n(),
        lab = paste("n =", comma(n, 1)),
        .groups = "drop"
      ),
    inherit.aes = FALSE
  ) +
  scale_colour_manual(values = direction_colours) +
  labs(
    x = paste(comps[[1]], "logFC"),
    y = paste(comps[[2]], "logFC"),
    colour = "Status"
  ) +
  theme(legend.position = "top")
*Comparative changes in both AR and H3K27ac. The window with most extreme combined change in binding is shown with any mapped genes labelled. The range around zero used for range-based hypothesis testing is indicated with grey dashed lines.*

Comparative changes in both AR and H3K27ac. The window with most extreme combined change in binding is shown with any mapped genes labelled. The range around zero used for range-based hypothesis testing is indicated with grey dashed lines.

logFC By Region

df <- all_windows %>% 
  as_tibble() %>% 
  rename_all(
    str_replace_all, pattern = "\\.Vs\\.\\.", replacement = " Vs. "
  ) %>% 
  rename_all(
    str_replace_all, pattern = "\\.\\.", replacement = ": "
  ) %>% 
  dplyr::filter(!str_detect(status, "Undetected")) %>% 
  droplevels() 
df %>% 
  ggplot(
    aes(
      !!sym(c1_logfc), !!sym(c2_logfc),
      colour = status
    )
  ) +
  geom_point(alpha = 0.6) +
  geom_hline(yintercept = 0) +
  geom_hline(
    yintercept = c(1, -1) * lambda,
    linetype = 2, colour = "grey"
  ) +
  geom_vline(xintercept = 0) +
  geom_vline(
    xintercept = c(1, -1) * lambda, 
    linetype = 2, colour = "grey"
  ) +
  geom_text_repel(
    aes(label = detected),
    data = . %>% 
      dplyr::filter(
        !str_detect(status, "Unchanged.+Unchanged"),
        vapply(detected, length, integer(1)) > 0
      ) %>% 
      mutate(
        total_lfc = abs(!!sym(c1_logfc)) + abs(!!sym(c2_logfc)) 
      ) %>% 
      group_by(region) %>% 
      dplyr::filter(total_lfc == max(total_lfc)) %>% 
      mutate(
        detected = vapply(detected, paste, character(1), collapse = "; ") %>% 
          str_wrap(25)
      ),
    size = 3, bg.color = "grey70", bg.r = 0.001,
    show.legend = FALSE
  ) +
  geom_text(
    aes(x, y, label = lab),
    data = . %>%
      mutate(
        x = 0.9*max(!!sym(c1_logfc)),
        y = min(!!sym(c2_logfc))
      ) %>% 
      group_by(region) %>%
      summarise(
        x = unique(x), 
        y = unique(y),
        n = dplyr::n(),
        lab = paste("n =", comma(n, 1)),
        .groups = "drop"
      ),
    inherit.aes = FALSE
  ) +
  facet_wrap(~region) +
  scale_colour_manual(values = direction_colours) +
  labs(
    x = paste(comps[[1]], "logFC"),
    y = paste(comps[[2]], "logFC"),
    colour = "Status"
  ) +
  theme(legend.position = "top")
*Comparative changes in both AR and H3K27ac. The window with most extreme combined change in binding for each separate region is shown with any mapped genes labelled. The range around zero used for range-based hypothesis testing is indicated with grey dashed lines.*

Comparative changes in both AR and H3K27ac. The window with most extreme combined change in binding for each separate region is shown with any mapped genes labelled. The range around zero used for range-based hypothesis testing is indicated with grey dashed lines.

AR By H3K27ac Status

all_windows %>%
  as_tibble() %>% 
  rename_all(
    str_replace_all, pattern = "\\.Vs\\.\\.", replacement = " Vs. "
  ) %>% 
  rename_all(
    str_replace_all, pattern = "\\.\\.", replacement = ": "
  ) %>% 
  dplyr::filter(
    !!sym(c1_status) != paste(comps[[1]], "Undetected")
  ) %>% 
  mutate(
    !!sym(c1_status) := fct_relabel(
      !!sym(c1_status), 
      str_extract, 
      pattern = "Up|Down|Unchanged|Undetected"
    ),
    !!sym(c2_status) := fct_relabel(
      !!sym(c2_status), 
      str_extract, 
      pattern = "Up|Down|Unchanged|Undetected"
    )
  ) %>% 
  droplevels() %>% 
  ggplot(
    aes(
      !!sym(c1_logcpm), 
      !!sym(c1_logfc),
      colour = !!sym(c1_status), 
      fill = !!sym(c2_status)
    )
  ) +
  geom_point(alpha = 0.6) +
  geom_xsideboxplot(
    aes(y = !!sym(c2_status)),
    orientation = "y", colour = "black"
  ) +
  geom_ysideboxplot(
    aes(x = !!sym(c2_status)),
    orientation = "x", colour = "black"
  ) +
  geom_ysideline(
    aes(x, y),
    data = tibble(x = seq(0, 5), y = 0),
    inherit.aes = FALSE
  ) +
  geom_hline(yintercept = 0) +
  geom_hline(yintercept = c(1, -1) * lambda, linetype = 2, colour = "grey30") +
  scale_xsidey_discrete() +
  scale_ysidex_discrete(guide = guide_axis(angle = 90)) +
  scale_colour_manual(
    values = colours$direction %>%
      setNames(str_to_title(names(.))) %>% 
      .[!str_detect(names(.), "Undetected")]
  ) +
  scale_fill_manual(
    values = colours$direction %>%
      setNames(str_to_title(names(.)))
  ) +
  labs(
    x = paste(comps[[1]], "logCPM"),
    y = paste(comps[[1]], "logFC"),
    colour = str_replace(comps[[1]], ": ", "\n"),
    fill = str_replace_all(comps[[2]], ": ", "\n")
  ) +
  theme(
    ggside.panel.scale.y = .2,
    ggside.panel.scale.x = .25,
    axis.title.x = element_text(hjust = 0.4, vjust = 1),
    axis.title.y = element_text(hjust = 0.4, vjust = 1)
  )
*Differential binding patterns of AR by H3K27ac status. AR values are shown as points coloured by their individual classification as Up, Down or Unchanged, with the distributions of these points shown as boxplots by H3K27ac status in the side panels.*

Differential binding patterns of AR by H3K27ac status. AR values are shown as points coloured by their individual classification as Up, Down or Unchanged, with the distributions of these points shown as boxplots by H3K27ac status in the side panels.

H3K27ac By AR Status

all_windows %>%
  as_tibble() %>% 
  rename_all(
    str_replace_all, pattern = "\\.Vs\\.\\.", replacement = " Vs. "
  ) %>% 
  rename_all(
    str_replace_all, pattern = "\\.\\.", replacement = ": "
  ) %>% 
  as_tibble() %>% 
  dplyr::filter(
    !!sym(c2_status) != paste(comps[[2]], "Undetected")
  ) %>% 
  mutate(
    !!sym(c1_status) := fct_relabel(
      !!sym(c1_status), 
      str_extract, 
      pattern = "Up|Down|Unchanged|Undetected"
    ),
    !!sym(c2_status) := fct_relabel(
      !!sym(c2_status), 
      str_extract, 
      pattern = "Up|Down|Unchanged|Undetected"
    )
  ) %>% 
  droplevels() %>% 
  ggplot(
    aes(
      !!sym(c2_logcpm), 
      !!sym(c2_logfc),
      colour = !!sym(c2_status), 
      fill = !!sym(c1_status)
    )
  ) +
  geom_point(alpha = 0.6) +
  geom_xsideboxplot(
    aes(y = !!sym(c1_status)),
    orientation = "y", colour = "black"
  ) +
  geom_ysideboxplot(
    aes(x = !!sym(c1_status)),
    orientation = "x", colour = "black"
  ) +
  geom_ysideline(
    aes(x, y),
    data = tibble(x = seq(0, 5), y = 0),
    inherit.aes = FALSE
  ) +
  geom_hline(yintercept = 0) +
  geom_hline(yintercept = c(1, -1) * lambda, linetype = 2, colour = "grey30") +
  scale_xsidey_discrete() +
  scale_ysidex_discrete(guide = guide_axis(angle = 90)) +
  scale_colour_manual(
    values = colours$direction %>%
      setNames(str_to_title(names(.))) %>% 
      .[!str_detect(names(.), "Undetected")]
  ) +
  scale_fill_manual(
    values = colours$direction %>%
      setNames(str_to_title(names(.)))
  ) +
  labs(
    x = paste(comps[[2]], "logCPM"),
    y = paste(comps[[2]], "logFC"),
    colour = str_replace(comps[[2]], ": ", "\n"),
    fill = str_replace_all(comps[[1]], ": ", "\n")
  ) +
  theme(
    ggside.panel.scale.y = .2,
    ggside.panel.scale.x = .25,
    axis.title.x = element_text(hjust = 0.4, vjust = 1),
    axis.title.y = element_text(hjust = 0.4, vjust = 1)
  )
*Differential binding patterns of H3K27ac by AR status. H3K27ac values are shown as points coloured by their individual classification as Up, Down or Unchanged, with the distributions of these shown as boxplots by AR status in the side panels.*

Differential binding patterns of H3K27ac by AR status. H3K27ac values are shown as points coloured by their individual classification as Up, Down or Unchanged, with the distributions of these shown as boxplots by AR status in the side panels.

Distribution By Region

all_windows$grp <- paste(
  mcols(all_windows)[[c1_status]], 
  mcols(all_windows)[[c2_status]], 
  all_windows$region
)
df <- all_windows %>% 
  split(.$grp) %>% 
  lapply(
    function(x) {
      tibble(
        "{c1_status}" := unique(mcols(x)[[c1_status]]),
        "{c2_status}" := unique(mcols(x)[[c2_status]]),
        region = unique(mcols(x)[["region"]]),
        n = length(x)
      )
    }
  ) %>% 
  bind_rows() %>% 
  dplyr::filter(
    str_detect(!!sym(c1_status), "Up|Down") | str_detect(!!sym(c2_status), "Up|Down")
  ) %>% 
  group_by(!!sym(c1_status), !!sym(c2_status)) %>% 
  mutate(total = sum(n)) %>% 
  ungroup() %>% 
  arrange(!!sym(c1_status), !!sym(c2_status), region) %>% 
  mutate(
    x0 = as.integer(!!sym(c1_status)),
    y0 = as.integer(!!sym(c2_status)),
    r = total / sum(total),
    r =  0.5* r / max(r),
    !!sym(c1_status) := fct_relabel(!!sym(c1_status), str_extract, pattern = "Up|Down|Un.+"),
    !!sym(c2_status) := fct_relabel(!!sym(c2_status), str_extract, pattern = "Up|Down|Un.+")
  )
df %>% 
  ggplot() +
  ggforce::stat_pie(
    aes(x0 = x0, y0 = y0, r0 = 0, r = r, fill = region, amount = n)
  ) +
  geom_label(
    aes(x0, y0, label = comma(total)),
    data = . %>% 
      distinct(x0, y0, total) %>% 
      dplyr::filter(total > 0.05 * sum(total)),
    alpha = 0.8, size = 4
  ) +
  coord_equal() +
  scale_fill_manual(values = region_colours) +
  scale_x_continuous(
    breaks = seq_along(levels(df[[c1_status]])), 
    labels = levels(df[[c1_status]])
  ) +
  scale_y_continuous(
    breaks = seq_along(levels(df[[c2_status]])), 
    labels = levels(df[[c2_status]])
  ) +
  labs(
    x = paste(comps[[1]], "Status"), 
    y = paste(comps[[2]], "Status"), 
    fill = "Region"
  ) +
  theme(
    panel.grid = element_blank()
  )
*Distribution of binding patterns by genomic regions as defined in earlier sections. Windows are only shown if change was detected in one of the comparisons.*

Distribution of binding patterns by genomic regions as defined in earlier sections. Windows are only shown if change was detected in one of the comparisons.

Distance By Status

all_windows %>% 
  as_tibble() %>% 
  dplyr::filter(!is.na(distance)) %>% 
  arrange(status, distance) %>% 
  group_by(status) %>% 
  mutate(
    n = dplyr::n(),
    q = seq_along(status) / n
  ) %>% 
  dplyr::filter(distance < 1e3, !str_detect(status, "Unchanged.+Unchanged")) %>% 
  ggplot(aes(distance, q, colour = status)) +
  geom_line(linewidth = 1) +
  scale_x_continuous(breaks = seq(0, 10, by = 2) * 100) +
  scale_y_continuous(
    labels = percent, expand = expansion(c(0, 0.05)), 
    breaks = seq(0, 1, by = 0.2)
  ) +
  scale_colour_manual(
    values = direction_colours
  ) +
  labs(
    x = "Distance Between Window Centres (bp)",
    y = "Percentile",
    colour = "Combined Status"
  )
*Distances between windows where maximal signal was detected for each target. Windows are only shown if change was detected in one or more comparisons. The x-axis is truncated at 1kb*

Distances between windows where maximal signal was detected for each target. Windows are only shown if change was detected in one or more comparisons. The x-axis is truncated at 1kb

Distance by Region

all_windows %>% 
  as_tibble() %>% 
  dplyr::filter(!is.na(distance)) %>% 
  arrange(region, distance) %>% 
  group_by(region) %>% 
  mutate(
    n = dplyr::n(),
    q = seq_along(region) / n
  ) %>% 
  dplyr::filter(distance < 1e3) %>% 
  ggplot(aes(distance, q, colour = region)) +
  geom_line(linewidth = 1) +
  scale_x_continuous(breaks = seq(0, 10, by = 2) * 100) +
  scale_y_continuous(
    labels = percent, expand = expansion(c(0, 0.05)), 
    breaks = seq(0, 1, by = 0.2)
  ) +
  scale_colour_manual(values = region_colours) +
  labs(
    x = "Distance Between Window Centres (bp)",
    y = "Percentile",
    colour = "Genomic Region"
  )
*Distribution of distances between peaks separated by genomic region. All sites are included regardless of changed binding. The x-axis is truncated 1kb.*

Distribution of distances between peaks separated by genomic region. All sites are included regardless of changed binding. The x-axis is truncated 1kb.

Combined Binding

Heatmaps

A series of profile heatmaps were generated for any set of pairwise comparisons with more than 25 ranges, and where binding was detected as changed in at least one of the comparisons. If comparing two targets, peaks were centred at the mid-point between the ranges of individual maximal signal. If only one target is detected within a range, peaks are centred at the range with maximal signal. Scales for fill and average signal (both logCPM) are held constant across all plots for easier comparison between groups.

## Set the BWFL as a single object incorporating target & treatment names
fl <- seq_along(pairs) %>% 
  lapply(
    function(x) {
      file.path(
        macs2_path[[x]], glue("{pairs[[x]]}_merged_treat_pileup.bw")
      ) %>% 
        setNames(paste(names(pairs)[[x]], pairs[[x]]))
    } 
  )%>% 
  unlist() %>% 
  .[!duplicated(.)]
bwfl <- setNames(BigWigFileList(fl), names(fl))
grl_to_plot <- all_windows %>% 
  split(.$status) %>% 
  .[str_detect(names(.),"Up|Down")] %>% 
  .[vapply(., length, integer(1)) > 25] %>% # Only show groups with > 10 ranges
  lapply(
    mutate, 
    centre = rowMeans(cbind(!!sym(c1_centre), !!sym(c2_centre)), na.rm = TRUE),
    rng = paste0(seqnames, ":", as.integer(centre))
  ) %>% 
  lapply(function(x) GRanges(x$rng, seqinfo = sq)) %>% 
  GRangesList()
profile_width <- 5e3
x_lab <- c(
  seq(0, floor(0.5 * profile_width / 1e3), length.out = 3),
  seq(-floor(0.5 * profile_width / 1e3), 0, length.out = 3)
) %>% 
  sort() %>% 
  unique()
n_bins <- 100
sig_profiles <- mclapply(
  grl_to_plot, 
  function(x) getProfileData(
    bwfl, x, upstream = profile_width / 2, bins = n_bins
  ),
  mc.cores = min(length(grl_to_plot), threads)
)
profile_heatmaps <- sig_profiles %>% 
  mclapply(
    plotProfileHeatmap,
    profileCol = "profile_data", 
    xLab = "Distance from Centre (bp)",
    fillLab = "logCPM",
    colour = "name",
    mc.cores = min(length(grl_to_plot), threads)
  ) 
fill_range <- profile_heatmaps %>% 
  lapply(function(x) x$data[,"score"]) %>% 
  unlist() %>%
  c(0) %>% 
  range()
sidey_range <- profile_heatmaps %>% 
  lapply(function(x) x$layers[[3]]$data$y) %>% 
  unlist() %>% 
  range()
profile_heatmaps <- profile_heatmaps %>% 
  lapply(
    function(x) {
      x + 
        scale_x_continuous(
          breaks = x_lab * 1e3, labels = x_lab, expand = expansion(0)
        ) +
        scale_xsidey_continuous(limits = sidey_range) +
        scale_fill_gradientn(colours = colours$heatmaps, limits = fill_range) +
        scale_colour_manual(
          values = lapply(
            seq_along(pairs), 
            function(i) setNames(
              treat_colours[pairs[[i]]], 
              paste(names(pairs)[[i]], pairs[[i]])
            )
          ) %>%
            unlist() %>% 
            .[!duplicated(names(.))]
        ) +
        labs(
          x = "Distance from Centre (kb)",
          fill = "logCPM", linetype = "Consensus\nPeak\nOverlap",
          colour = "Sample"
        ) 
    }
  )
htmltools::tagList(
  mclapply(
    seq_along(profile_heatmaps),
    function(x) {
      ## Export the image
      pw <- names(profile_heatmaps)[[x]] 
      img_out <- file.path(
        fig_path,
        pw %>% 
          str_remove_all("(:|Vs. |- )") %>% 
          str_replace_all(" ", "_") %>% 
          str_replace_all("_-_", "-") %>% 
          paste0("_profile_heatmap.", fig_type)
      )
      n_ranges <- length(unique(profile_heatmaps[[x]]$data$range))
      n_samples<- length(unique(profile_heatmaps[[x]]$data$name))
      fig_fun(
        filename = img_out,
        width = knitr::opts_current$get("fig.width") * n_samples / 4, 
        height = min(
          3.5 + knitr::opts_current$get("fig.height") * n_ranges / 1.5e3, 
          10
        )
      )
      print(profile_heatmaps[[x]])
      dev.off()
      ## Create html tags
      fig_link <- str_extract(img_out, "assets.+")
      cp <- htmltools::tags$em(
        glue(
          "
        All {n_ranges} ranges showing the pattern of {pw}. Ranges are centered 
        at the midpoint between ranges identified as the maximal signal range.
        "
        )
      )
      htmltools::div(
        htmltools::div(
          id = img_out %>% 
            basename() %>% 
            str_remove_all(glue(".{fig_type}$")) %>% 
            str_to_lower() %>% 
            str_replace_all("_", "-"),
          class="section level3",
          htmltools::h3(pw),
          htmltools::div(
            class = "figure", style = "text-align: center",
            htmltools::img(src = fig_link, width = "100%"),
            htmltools::tags$caption(cp)
          )
        )
      )
    },
    mc.cores = threads
  )
)

AR Up - H3K27ac Up

All 501 ranges showing the pattern of AR Up - H3K27ac Up. Ranges are centered at the midpoint between ranges identified as the maximal signal range.

AR Up - H3K27ac Unchanged

All 3910 ranges showing the pattern of AR Up - H3K27ac Unchanged. Ranges are centered at the midpoint between ranges identified as the maximal signal range.

AR Up - H3K27ac Undetected

All 7057 ranges showing the pattern of AR Up - H3K27ac Undetected. Ranges are centered at the midpoint between ranges identified as the maximal signal range.

Windows With The Strongest Signal

For the initial set of visualisations, the sets of windows with the strongest signal were selected from within each group. Groups were restricted to those where differential binding was seen in at least comparison.

grl_to_plot <- all_windows %>% 
  filter(
    !str_detect(status, "Un.+Un"),
    vapply(detected, length, integer(1)) > 0
  ) %>% 
  as_tibble() %>% 
  rename_all(
    str_replace_all, pattern = "\\.Vs\\.\\.", replacement = " Vs. "
  ) %>% 
  rename_all(
    str_replace_all, pattern = "\\.\\.", replacement = ": "
  ) %>% 
  nest(
    AveExpr = ends_with("AveExpr")
  ) %>% 
  mutate(
    max_signal = vapply(AveExpr, function(x) max(unlist(x)), numeric(1))
  ) %>% 
  group_by(status) %>% 
  filter(max_signal == max(max_signal)) %>% 
  dplyr::select(
    range, status, ends_with("logFC")
  ) %>% 
  droplevels() %>% 
  split(.$status) %>% 
  lapply(pull, "range") %>% 
  lapply(function(x) x[1]) %>% 
  lapply(GRanges, seqinfo = sq) %>% 
  GRangesList()
## The coverage
bwfl <- seq_along(pairs) %>% 
  lapply(
    function(x) {
      file.path(
        macs2_path[[x]], glue("{pairs[[x]]}_merged_treat_pileup.bw")
      ) %>% 
        BigWigFileList() %>% 
        setNames(pairs[[x]])
    } 
  ) %>% 
  setNames(comps)
line_col <- lapply(bwfl, function(x) treat_colours[names(x)])

## Coverage annotations
annot <- comps %>% 
  lapply(
    function(x) {
      col <- glue("{x}_status")
      select(all_windows, all_of(col)) %>% 
        mutate(
          status = str_extract(!!sym(col), "Down|Up|Unchanged|Undetected")
        ) %>% 
        splitAsList(.$status) %>% 
        lapply(granges) %>% 
        lapply(unique) %>% 
        GRangesList()
    }
  ) %>% 
  setNames(comps)
annot_col <- unlist(colours$direction) %>% 
  setNames(str_to_title(names(.)))



## Coverage y-limits
gr <- unlist(grl_to_plot)
y_lim <- lapply(
  bwfl,
  function(x) {
    cov <- lapply(x, import.bw, which = gr)
    unlist(lapply(cov, function(y) max(y$score))) %>% 
      c(0) %>% 
      range()
  }
)

## The features track
feat_gr <- gene_regions %>% 
  lapply(granges) %>% 
  GRangesList()
feature_colours <- colours$regions
if (has_features) {
  feat_gr <- list(Regions = feat_gr)
  feat_gr$Features <- splitAsList(external_features, external_features$feature)
  feature_colours <- list(
    Regions = unlist(colours$regions),
    Features = unlist(colours$features)
  )
}

## The genes track
hfgc_genes <- trans_models
gene_col <- "grey"
if (has_rnaseq) {
  rna_lfc_col <- colnames(rnaseq)[str_detect(str_to_lower(colnames(rnaseq)), "logfc")][1]
  rna_fdr_col <- colnames(rnaseq)[str_detect(str_to_lower(colnames(rnaseq)), "fdr|adjp")][1]
  if (!is.na(rna_lfc_col) & !is.na(rna_fdr_col)) {
    hfgc_genes <- trans_models %>% 
      mutate(
        status = case_when(
          !gene %in% rnaseq$gene_id ~ "Undetected",
          gene %in% dplyr::filter(
            rnaseq, !!sym(rna_lfc_col) > 0, !!sym(rna_fdr_col) < fdr_alpha
          )$gene_id ~ "Up",
          gene %in% dplyr::filter(
            rnaseq, !!sym(rna_lfc_col) < 0, !!sym(rna_fdr_col) < fdr_alpha
          )$gene_id ~ "Down",
          gene %in% dplyr::filter(
            rnaseq, !!sym(rna_fdr_col) >= fdr_alpha
          )$gene_id ~ "Unchanged",
        )
      ) %>% 
      splitAsList(.$status) %>% 
      lapply(select, -status) %>% 
      GRangesList()
    gene_col <- colours$direction %>% 
      setNames(str_to_title(names(.)))
  }
}

## External Coverage (Optional)
if (!is.null(config$external$coverage)) {
  ext_cov_path <- config$external$coverage %>% 
    lapply(unlist) %>% 
    lapply(function(x) setNames(here::here(x), names(x)))
  bwfl <- c(
    bwfl[comps],
    lapply(ext_cov_path, function(x) BigWigFileList(x) %>% setNames(names(x)))
  )
  line_col <- c(
    line_col[comps],
    ext_cov_path %>% 
      lapply(
        function(x) {
          missing <- setdiff(names(x), names(treat_colours))
          cmn <- intersect(names(x), names(treat_colours))
          col <- setNames(character(length(names(x))), names(x))
          if (length(cmn) > 0) col[cmn] <- treat_colours[cmn]
          if (length(missing) > 0) 
            col[missing] <- hcl.colors(
              max(5, length(missing)), "Zissou 1")[seq_along(missing)]
          col
        }
      )
  )
  
  y_ranges <- grl_to_plot %>% 
    unlist() %>% 
    granges() %>% 
    resize(w = 10 * width(.), fix = 'center')
  
  y_lim <- c(
    y_lim[comps],
    bwfl[names(config$external$coverage)] %>% 
      lapply(
        function(x) {
          GRangesList(lapply(x, import.bw, which = y_ranges)) %>% 
            unlist() %>% 
            filter(score == max(score)) %>% 
            mcols() %>% 
            .[["score"]] %>% 
            c(0) %>% 
            range()
        }
      )
  )
  
}
htmltools::tagList(
  mclapply(
    seq_along(grl_to_plot),
    function(x) {
      ## Export the image
      img_out <- file.path(
        fig_path,
        names(grl_to_plot)[[x]] %>% 
          str_remove_all("(:|Vs. |- )") %>% 
          str_replace_all(" ", "_") %>% 
          paste0("_AveExpr.", fig_type)
      )
      fig_fun(
        filename = img_out, width = 10, height = 8
      )
      ct <- FALSE
      if (length(subsetByOverlaps(all_gr$transcript, grl_to_plot[[x]])) > 10)
        ct <- "meta"
      plotHFGC(
        grl_to_plot[[x]],
        features = feat_gr, featcol = feature_colours, 
        featsize = 1 + has_features,
        genes = hfgc_genes, genecol = gene_col,
        coverage = bwfl, linecol = line_col,
        annotation = annot, annotcol = annot_col,
        cytobands = bands_df,
        rotation.title = 90,
        zoom = 10,
        ylim = y_lim,
        collapseTranscripts = ct,
        col.title = "black", background.title = "white", showAxis = FALSE
      )
      dev.off()
      ## Create html tags
      fig_link <- str_extract(img_out, "assets.+")
      gn <- unlist(subsetByOverlaps(all_windows, grl_to_plot[[x]])$detected)
      cp <- htmltools::tags$em(
        glue(
          "
        Highlighted region corresponds to the highest signal within the group 
        {names(grl_to_plot)[[x]]}. The most likely target 
        {ifelse(length(gn) == 1, 'gene is ', 'genes are ')}
        {collapseGenes(gn, format = '')}
        "
        )
      )
      htmltools::div(
        htmltools::div(
          id = names(grl_to_plot)[[x]] %>% 
            str_remove_all("(:|Vs. |- )") %>% 
            str_replace_all(" ", "-") %>% 
            str_to_lower() %>% 
            paste0("-aveexpr"),
          class="section level3",
          htmltools::h3(names(grl_to_plot)[[x]]),
          htmltools::div(
            class = "figure", style = "text-align: center",
            htmltools::img(src = fig_link, width = 960),
            htmltools::p(
              class = "caption", htmltools::tags$em(cp)
            )
          )
        )
      )
    },
    mc.cores = threads
  )
)  

AR Up - H3K27ac Up

Highlighted region corresponds to the highest signal within the group AR Up - H3K27ac Up. The most likely target gene is EIF4BP8

AR Up - H3K27ac Down

Highlighted region corresponds to the highest signal within the group AR Up - H3K27ac Down. The most likely target genes are FLOT1, HCG20, IER3, IER3-AS1 and RN7SL353P

AR Up - H3K27ac Unchanged

Highlighted region corresponds to the highest signal within the group AR Up - H3K27ac Unchanged. The most likely target genes are AC092171.1, AC092171.3, AC092171.4, AC092171.5, AC093620.1 and TNRC18

AR Unchanged - H3K27ac Up

Highlighted region corresponds to the highest signal within the group AR Unchanged - H3K27ac Up. The most likely target genes are RARB and TOP2B

AR Unchanged - H3K27ac Down

Highlighted region corresponds to the highest signal within the group AR Unchanged - H3K27ac Down. The most likely target gene is WWOX

Windows With The Largest Change

For the initial set of visualisations, the sets of windows with the strongest signal were selected from within each group. Groups were restricted to those where differential binding was seen in at least comparison.

grl_to_plot <- all_windows %>% 
  filter(
    !str_detect(status, "Un.+Un"),
    vapply(detected, length, integer(1)) > 0
  ) %>% 
  as_tibble() %>% 
  rename_all(
    str_replace_all, pattern = "\\.Vs\\.\\.", replacement = " Vs. "
  ) %>% 
  rename_all(
    str_replace_all, pattern = "\\.\\.", replacement = ": "
  ) %>% 
  nest(
    logFC = ends_with("logFC")
  ) %>% 
  mutate(
    max_logFC = vapply(logFC, function(x) max(abs(unlist(x))), numeric(1))
  ) %>% 
  group_by(status) %>% 
  filter(max_logFC == max(max_logFC)) %>% 
  dplyr::select(
    range, status, ends_with("logFC")
  ) %>% 
  droplevels() %>% 
  split(.$status) %>% 
  lapply(pull, "range") %>% 
  lapply(GRanges, seqinfo = sq) %>% 
  GRangesList()
## Coverage y-limits
gr <- unlist(grl_to_plot)
y_lim <- lapply(
  bwfl,
  function(x) {
    cov <- lapply(x, import.bw, which = gr)
    unlist(lapply(cov, function(y) max(y$score))) %>% 
      c(0) %>% 
      range()
  }
)

## External Coverage (Optional)
if (!is.null(config$external$coverage)) {
  y_ranges <- grl_to_plot %>% 
    unlist() %>% 
    granges() %>% 
    resize(w = 10 * width(.), fix = 'center')
  
  y_lim <- c(
    y_lim[comps],
    bwfl[names(config$external$coverage)] %>% 
      lapply(
        function(x) {
          GRangesList(lapply(x, import.bw, which = y_ranges)) %>% 
            unlist() %>% 
            filter(score == max(score)) %>% 
            mcols() %>% 
            .[["score"]] %>% 
            c(0) %>% 
            range()
        }
      )
  )
  
}
htmltools::tagList(
  mclapply(
    seq_along(grl_to_plot),
    function(x) {
      ## Export the image
      img_out <- file.path(
        fig_path,
        names(grl_to_plot)[[x]] %>% 
          str_remove_all("(:|Vs. |- )") %>% 
          str_replace_all(" ", "_") %>% 
          paste0("_logFC.", fig_type)
      )
      fig_fun(
        filename = img_out, width = 10, height = 8
      )
      ct <- FALSE
      if (length(subsetByOverlaps(all_gr$transcript, grl_to_plot[[x]])) > 10)
        ct <- "meta"
      plotHFGC(
        grl_to_plot[[x]],
        features = feat_gr, featcol = feature_colours,
        featsize = 1 + has_features,
        genes = hfgc_genes, genecol = gene_col,
        coverage = bwfl, linecol = line_col,
        annotation = annot, annotcol = annot_col,
        cytobands = bands_df,
        rotation.title = 90,
        zoom = 10,
        ylim = y_lim,
        collapseTranscripts = ct,
        col.title = "black", background.title = "white", showAxis = FALSE
      )
      dev.off()
      ## Create html tags
      fig_link <- str_extract(img_out, "assets.+")
      gn <- unlist(subsetByOverlaps(all_windows, grl_to_plot[[x]])$detected)
      cp <- htmltools::tags$em(
        glue(
          "
        Highlighted region corresponds to the largest change within the group 
        {names(grl_to_plot)[[x]]}. The most likely target 
        {ifelse(length(gn) == 1, 'gene is ', 'genes are ')}
        {collapseGenes(gn, format = '')}
        "
        )
      )
      htmltools::div(
        htmltools::div(
          id = names(grl_to_plot)[[x]] %>% 
            str_remove_all("(:|Vs. |- )") %>% 
            str_replace_all(" ", "-") %>% 
            str_to_lower() %>% 
            paste0("-logfc"),
          class="section level3",
          htmltools::h3(names(grl_to_plot)[[x]]),
          htmltools::div(
            class = "figure", style = "text-align: center",
            htmltools::img(src = fig_link, width = 960),
            htmltools::p(
              class = "caption", htmltools::tags$em(cp)
            )
          )
        )
      )
    },
    mc.cores = threads
  )
)  

AR Up - H3K27ac Up

Highlighted region corresponds to the largest change within the group AR Up - H3K27ac Up. The most likely target gene is AC078864.2

AR Up - H3K27ac Down

Highlighted region corresponds to the largest change within the group AR Up - H3K27ac Down. The most likely target gene is PTGER3

AR Up - H3K27ac Unchanged

Highlighted region corresponds to the largest change within the group AR Up - H3K27ac Unchanged. The most likely target gene is RN7SL131P

AR Unchanged - H3K27ac Up

Highlighted region corresponds to the largest change within the group AR Unchanged - H3K27ac Up. The most likely target gene is GOLPH3

AR Unchanged - H3K27ac Down

Highlighted region corresponds to the largest change within the group AR Unchanged - H3K27ac Down. The most likely target gene is WWOX

Enrichment Analysis

min_gs_size <- params$enrichment$min_size
if (is.null(min_gs_size) | is.na(min_gs_size)) min_gs_size <- 0
max_gs_size <- params$enrichment$max_size
if (is.null(max_gs_size) | is.na(max_gs_size)) max_gs_size <- Inf
min_sig <- params$enrichment$min_sig
if (is.null(min_sig) | is.na(min_sig)) min_sig <- 1

all_ids <- genes_gr %>%
  mutate(w = width) %>% 
  as_tibble() %>% 
  dplyr::select(gene_id, w) %>% 
  arrange(desc(w)) %>% 
  distinct(gene_id, w) %>% 
  arrange(gene_id) %>% 
  left_join(
    all_windows$gene_id %>%
      unlist() %>%
      table() %>%
      enframe(name = "gene_id", value = "n_windows"),
    by = "gene_id"
  ) %>% 
  dplyr::filter(gene_id %in% genes_gr$gene_id) %>% 
  mutate(
    n_windows = as.integer(ifelse(is.na(n_windows), 0, n_windows))
  )
msigdb <- msigdbr(species = "Homo sapiens") %>% 
  dplyr::filter(
    gs_cat %in% unlist(params$enrichment$msigdb$gs_cat) |
      gs_subcat %in% unlist(params$enrichment$msigdb$gs_subcat),
    str_detect(ensembl_gene, "^E")
  ) %>% 
  dplyr::rename(gene_id = ensembl_gene, gene_name = gene_symbol) %>% # For easier integration 
  dplyr::select(-starts_with("human"), -contains("entrez")) %>% 
  dplyr::filter(gene_id %in% all_ids$gene_id) %>% 
  group_by(gs_name) %>% 
  mutate(n = dplyr::n()) %>% 
  ungroup() %>% 
  dplyr::filter(n >= min_gs_size, n <= max_gs_size) 
gs_by_gsid <- msigdb %>% 
  split(.$gs_name) %>% 
  mclapply(pull, "gene_id", mc.cores = threads) %>% 
  mclapply(unique, mc.cores = threads)
gs_by_geneid <- msigdb %>% 
  split(.$gene_id) %>% 
  mclapply(pull, "gs_name", mc.cores = threads) %>% 
  mclapply(unique, mc.cores = threads)
gs_url <- msigdb %>% 
  distinct(gs_name, gs_url) %>%
  mutate(
    gs_url = ifelse(
      gs_url == "", "
      http://www.gsea-msigdb.org/gsea/msigdb/collections.jsp",
      gs_url
    )
  ) %>% 
  with(setNames(gs_url, gs_name))

min_network_size <- params$networks$min_size
if (is.null(min_network_size) | is.na(min_network_size))
  min_network_size <- 2

max_network_size <- params$networks$max_size
if (is.null(max_network_size) | is.na(max_network_size))
  max_network_size <- Inf

max_network_dist <- params$networks$max_distance
if (is.null(max_network_dist) | is.na(max_network_dist))
  max_network_dist <- 1

net_layout <- params$networks$layout
enrich_alpha <- params$enrichment$alpha
adj_method <- match.arg(params$enrichment$adj, p.adjust.methods)
adj_desc <- case_when(
  p.adjust.methods %in% c("fdr", "BH") ~ "the Benjamini-Hochberg FDR",
  p.adjust.methods %in% c("BY") ~ "the Benjamini-Yekutieli FDR",
  p.adjust.methods %in% c("bonferroni") ~ "the Bonferroni",
  p.adjust.methods %in% c("holm") ~ "Holm's",
  p.adjust.methods %in% c("hommel") ~ "Hommel's",
  p.adjust.methods %in% c("hochberg") ~ "Hochberg's",
  p.adjust.methods %in% c("none") ~ "no"
) %>% 
  setNames(p.adjust.methods)
comp_ids <- comps %>% 
  lapply(
    function(x) {
      subset(all_windows, !str_detect(status, paste(x, "Undetected")))$gene_id
    }
  ) %>% 
  lapply(unlist) %>% 
  lapply(unique) %>% 
  lapply(intersect, genes_gr$gene_id) %>% 
  setNames(comps)
mapped_ids <- comp_ids %>% 
  unlist() %>% 
  unique()
gene_lengths <- structure(width(all_gr$gene), names = all_gr$gene$gene_id)[mapped_ids]
group_ids <- all_windows %>% 
  split(f = .$status) %>% 
  mclapply(
    function(x) unique(unlist(x$gene_id)),
    mc.cores = threads
  ) 

Comparison of Genes Mapped To Targets

The first level of enrichment testing performed was to simply check the genes mapped to a binding site in either comparison, with no consideration being paid to the dynamics of binding in either comparison. This will capture and describe the overall biological activity being jointly regulated.

The genes in each section and intersection in the following Venn Diagram will then be analysed against the complete set of 26,098 genes mapped to either AR or H3K27ac. Firstly, all 26,098 genes mapped to a binding site were tested for enrichment in comparison to the set of 62,400 annotated genes. For this analysis, gene width was used as a potential source of sampling bias and Wallenius’ Non-Central Hypergeomteric was used as implemented in goseq (Young et al. 2010).

The sets of genes mapped to each section of the Venn Diagram were then tested for enrichment in comparison to the larger set of 26,098 mapped genes. The standard hypergeometric distribution was used for these analyses.

If 4 or more gene-sets were considered to be enriched, a network plot was produced using the same methodology as for results from differential binding for individual comparisons.

venn_params <- list(
  area1 = length(comp_ids[[1]]),
  area2 = length(comp_ids[[2]]),
  cross.area = sum(duplicated(unlist(comp_ids))),
  category = comps,
  fill = comp_cols[comps],
  cex = 1.2, cat.cex = 1.2
)
grid.newpage()
do.call("draw.pairwise.venn", venn_params)
*Venn Diagram showing __genes__ mapped to a binding region associated with AR or H3K27ac. Of the 62,400 genes under consideration, a total of 26,098 genes were mapped to a binding region across both comparisons, leaving 36,302 genes unmapped to a binding region across either comparison.*

Venn Diagram showing genes mapped to a binding region associated with AR or H3K27ac. Of the 62,400 genes under consideration, a total of 26,098 genes were mapped to a binding region across both comparisons, leaving 36,302 genes unmapped to a binding region across either comparison.

Result Tables

Mapped Genes To Either AR or H3K27ac

pwf_mapped <- all_gr$gene %>%
  mutate(w = width) %>%
  as_tibble() %>% 
  arrange(gene_id, desc(w)) %>%
  distinct(gene_id, w) %>% 
  mutate(mapped = gene_id %in% mapped_ids) %>% 
  with(nullp(setNames(mapped, gene_id), bias.data = log10(w)))
goseq_mapped_res <- pwf_mapped %>% 
  goseq(gene2cat = gs_by_geneid) %>% 
  dplyr::filter(numDEInCat > 0) %>% 
  as_tibble() %>% 
  dplyr::select(
    gs_name = category, pval = over_represented_pvalue, starts_with("num")
  ) %>% 
  mutate(
    adj_p = p.adjust(pval, adj_method),
    enriched = adj_p < enrich_alpha
  ) %>% 
  dplyr::filter(numDEInCat >= min_sig) %>% 
  left_join(
    dplyr::filter(msigdb, gene_id %in% mapped_ids)[c("gs_name", "gene_name", "gs_url", "gs_description")],
    by = "gs_name"
  ) %>% 
  chop(gene_name) %>% 
  mutate(
    gene_name = vapply(gene_name, paste, character(1), collapse = "; ")
  )
any_goseq_mapped <- sum(goseq_mapped_res$adj_p < enrich_alpha) > 0
tg_mapped <- make_tbl_graph(
  goseq_mapped_res, 
  gs = gs_by_gsid %>% 
    lapply(intersect, rownames(subset(pwf_mapped, DEgenes))) %>% 
    .[vapply(., length, integer(1)) > 0]
)
plot_network_mapped <- length(tg_mapped) >= min_network_size
tbl <- goseq_mapped_res %>%
  dplyr::filter(enriched) %>% 
  dplyr::select(
    gs_name, gs_description, pval, adj_p, starts_with("num"), gene_name
  ) %>% 
  reactable(
    filterable = TRUE,
    showPageSizeOptions = TRUE,
    columns = list2(
      gs_name = colDef(
        name = "Gene Set",
        cell = function(value) htmltools::tags$a(
          href = gs_url[[value]], 
          target = "_blank", 
          str_replace_all(value, "_", " ")
        ),
        html = TRUE,
        maxWidth = 150
      ),
      gs_description = colDef(
        name = "Description",
        cell = function(value) str_trunc(value, width = 150)
      ),
      pval = colDef(
        name = "P-Value",
        cell = function(value) ifelse(
          value < 0.001,
          sprintf("%.2e", value),
          sprintf("%.3f", value)
        ),
        maxWidth = 80
      ),
      adj_p = colDef(
        name = "p<sub>adj</sub>", html = TRUE,
        cell = function(value) ifelse(
          value < 0.001,
          sprintf("%.2e", value),
          sprintf("%.3f", value)
        ),
        maxWidth = 80
      ),
      numDEInCat = colDef(
        name = "Mapped Genes",
        cell = function(value) comma(value, 1),
        maxWidth = 80
      ),
      numInCat = colDef(
        name = "Genes in Gene-Set",
        cell = function(value) comma(value, 1),
        maxWidth = 80
      ),
      gene_name = colDef(
        name = "Gene Names",
        cell = function(value) with_tooltip(value, width = 150)
      )
    )
  )
cp <- glue(
  "
  All {comma(length(mapped_ids))} genes with a mapped binding window from 
  either comparison were compared to the complete set of 
  {comma(length(unique(all_gr$gene$gene_id)))} annotated genes, in order to 
  ascertain which key gene-sets and pathways were likely to be targeted in either analysis. 
  ",
  ifelse(
    has_rnaseq,
    "Genes were restricted to those in the set of detected genes from the provided RNA-Seq dataset. ", 
    ""
  ),
  "
  Gene-length was included as the source of any potential sampling bias for a 
  gene being mapped to any target-bound range. This analysis provides an overall 
  viewpoint as to which pathways are regulated across either dataset.
  "
)
div(class = "table",
    div(class = "table-header", div(class = "caption", cp)),
    tbl
)
All 26,098 genes with a mapped binding window from either comparison were compared to the complete set of 62,400 annotated genes, in order to ascertain which key gene-sets and pathways were likely to be targeted in either analysis. Gene-length was included as the source of any potential sampling bias for a gene being mapped to any target-bound range. This analysis provides an overall viewpoint as to which pathways are regulated across either dataset.

Genes Mapped to Both AR and H3K27ac

pwf_t1t2 <- nullp(
  vec_t1t2,
  bias.data = log10(setNames(all_ids$w, all_ids$gene_id)[mapped_ids]), 
  plot.fit = TRUE
) 
if (sum(pwf_t1t2$DEgenes) > 0) {
  goseq_t1t2_res <- pwf_t1t2 %>% 
    goseq(gene2cat = gs_by_geneid, method = gs_method) %>% 
    dplyr::filter(numDEInCat > 0) %>% 
    as_tibble() %>% 
    dplyr::select(
      gs_name = category, pval = over_represented_pvalue, starts_with("num")
    ) %>% 
    mutate(
      adj_p = p.adjust(pval, adj_method),
      enriched = adj_p < enrich_alpha
    ) %>% 
    dplyr::filter(numDEInCat >= min_sig) %>% 
    left_join(
      dplyr::filter(msigdb, gene_id %in% mapped_ids)[c("gs_name", "gene_name", "gs_url", "gs_description")],
      by = "gs_name"
    ) %>% 
    chop(gene_name) %>% 
    mutate(gene_name = vapply(gene_name, paste, character(1), collapse = "; "))
}
any_goseq_t1t2 <- sum(goseq_t1t2_res$adj_p < enrich_alpha) > 0
tg_t1t2 <- make_tbl_graph(
  goseq_t1t2_res, 
  gs = gs_by_gsid %>% 
    lapply(intersect, rownames(subset(pwf_t1t2, DEgenes))) %>% 
    .[vapply(., length, integer(1)) > 0]
)
plot_network_t1t2 <- length(tg_t1t2) >= min_network_size
if (!any_goseq_t1t2) para <- glue(para, " No enrichment was found.")
tbl <- goseq_t1t2_res %>%
  dplyr::filter(enriched) %>% 
  dplyr::select(
    gs_name, gs_description, pval, adj_p, starts_with("num"), gene_name
  ) %>% 
  reactable(
    filterable = TRUE,
    showPageSizeOptions = TRUE,
    columns = list2(
      gs_name = colDef(
        name = "Gene Set",
        cell = function(value) htmltools::tags$a(
          href = gs_url[[value]], 
          target = "_blank", 
          str_replace_all(value, "_", " ")
        ),
        html = TRUE,
        maxWidth = 150
      ),
      gs_description = colDef(
        name = "Description",
        cell = function(value) str_trunc(value, width = 150)
      ),
      pval = colDef(
        name = "P-Value",
        cell = function(value) ifelse(
          value < 0.001,
          sprintf("%.2e", value),
          sprintf("%.3f", value)
        ),
        maxWidth = 80
      ),
      adj_p = colDef(
        name = "p<sub>adj</sub>", html = TRUE,
        cell = function(value) ifelse(
          value < 0.001,
          sprintf("%.2e", value),
          sprintf("%.3f", value)
        ),
        maxWidth = 80
      ),
      numDEInCat = colDef(
        name = "Genes Mapped to Both",
        cell = function(value) comma(value, 1),
        maxWidth = 80
      ),
      numInCat = colDef(
        name = "Genes Mapped to Either",
        cell = function(value) comma(value, 1),
        maxWidth = 80
      ),
      gene_name = colDef(
        name = "Gene Names",
        cell = function(value) with_tooltip(value, width = 150)
      )
    )
  )
div(class = "table",
    div(class = "table-header", div(class = "caption", para)),
    tbl
)
All 7,623 genes with a mapped binding window to both AR and H3K27ac were compared to the 26,098 genes mapped to either, in order to ascertain whether any key gene-sets were likely to be targeted by both factors in combination. No sampling bias offset was incorporated, instead using a simple Hypergeometric model for enrichment testing.

Genes Mapped to AR Only

goseq_t1only_res <- tibble(
  gs_name = character(), mapped = numeric(), enriched = logical(),
  adj_p = numeric()
)
vec_t1only <- structure(
  mapped_ids %in% setdiff(comp_ids[[1]], comp_ids[[2]]),
  names = mapped_ids
)
gs_method <- "Hypergeometric"
any_goseq_t1only <- plot_network_t1only <- FALSE
para <- glue(
  "
  All {comma(length(setdiff(comp_ids[[1]], comp_ids[[2]])))} genes with a 
  mapped binding window to only {comps[[1]]} were compared to 
  the {comma(length(mapped_ids))} genes mapped to either
  {glue_collapse(comps, last = ' or ')}, in order to ascertain whether any gene-sets 
  were likely to be targeted exclusively. No sampling bias offset was 
  incorporated, instead using a simple Hypergeometric model for enrichment testing.
  "
)
pwf_t1only <- nullp(
  vec_t1only,
  bias.data = log10(setNames(all_ids$w, all_ids$gene_id)[mapped_ids]), 
  plot.fit = TRUE
) 
if (sum(pwf_t1only$DEgenes) > 0) {
  goseq_t1only_res <- pwf_t1only %>% 
    goseq(gene2cat = gs_by_geneid, method = gs_method) %>% 
    dplyr::filter(numDEInCat > 0) %>% 
    as_tibble() %>% 
    dplyr::select(
      gs_name = category, pval = over_represented_pvalue, starts_with("num")
    ) %>% 
    mutate(
      adj_p = p.adjust(pval, adj_method),
      enriched = adj_p < enrich_alpha
    ) %>% 
    dplyr::filter(numDEInCat >= min_sig) %>% 
    left_join(
      dplyr::filter(msigdb, gene_id %in% mapped_ids)[c("gs_name", "gene_name", "gs_url", "gs_description")],
      by = "gs_name"
    ) %>% 
    chop(gene_name) %>% 
    mutate(gene_name = vapply(gene_name, paste, character(1), collapse = "; "))
}
any_goseq_t1only <- sum(goseq_t1only_res$adj_p < enrich_alpha) > 0
tg_t1only <- make_tbl_graph(
  goseq_t1only_res, 
  gs = gs_by_gsid %>% 
    lapply(intersect, rownames(subset(pwf_t1only, DEgenes))) %>% 
    .[vapply(., length, integer(1)) > 0]
)
plot_network_t1only <- length(tg_t1only) >= min_network_size
if (!any_goseq_t1only) para <- glue(para, " No enrichment was found.")
tbl <- goseq_t1only_res %>%
  dplyr::filter(enriched) %>% 
  dplyr::select(
    gs_name, gs_description, pval, adj_p, starts_with("num"), gene_name
  ) %>% 
  reactable(
    filterable = TRUE,
    showPageSizeOptions = TRUE,
    columns = list2(
      gs_name = colDef(
        name = "Gene Set",
        cell = function(value) htmltools::tags$a(
          href = gs_url[[value]], 
          target = "_blank", 
          str_replace_all(value, "_", " ")
        ),
        html = TRUE,
        maxWidth = 150
      ),
      gs_description = colDef(
        name = "Description",
        cell = function(value) str_trunc(value, width = 150)
      ),
      pval = colDef(
        name = "P-Value",
        cell = function(value) ifelse(
          value < 0.001,
          sprintf("%.2e", value),
          sprintf("%.3f", value)
        ),
        maxWidth = 80
      ),
      adj_p = colDef(
        name = "p<sub>adj</sub>", html = TRUE,
        cell = function(value) ifelse(
          value < 0.001,
          sprintf("%.2e", value),
          sprintf("%.3f", value)
        ),
        maxWidth = 80
      ),
      numDEInCat = colDef(
        name = glue("Genes Mapped to {comps[[1]]} Only"),
        cell = function(value) comma(value, 1),
        maxWidth = 80
      ),
      numInCat = colDef(
        name = "Genes Mapped to Both",
        cell = function(value) comma(value, 1),
        maxWidth = 80
      ),
      gene_name = colDef(
        name = "Gene Names",
        cell = function(value) with_tooltip(value, width = 150)
      )
    )
  )
div(class = "table",
    div(class = "table-header", div(class = "caption", para)),
    tbl
)
All 2,459 genes with a mapped binding window to only AR were compared to the 26,098 genes mapped to either AR or H3K27ac, in order to ascertain whether any gene-sets were likely to be targeted exclusively. No sampling bias offset was incorporated, instead using a simple Hypergeometric model for enrichment testing.

Genes Mapped to H3K27ac Only

goseq_t2only_res <- tibble(
  gs_name = character(), mapped = numeric(), enriched = logical(),
  adj_p = numeric()
)
vec_t2only <- structure(
  mapped_ids %in% setdiff(comp_ids[[2]], comp_ids[[1]]),
  names = mapped_ids
)
gs_method <- "Hypergeometric"
para <- glue(
  "
  All {comma(length(setdiff(comp_ids[[2]], comp_ids[[1]])))} genes with a 
  mapped binding window to only {comps[[2]]} were compared to 
  the {comma(length(mapped_ids))} genes mapped to either 
  {glue_collapse(comps, last = ' or ')}, in order to ascertain whether any gene-sets 
  were likely to be targeted exclusively. No sampling bias offset was 
  incorporated, instead using a simple Hypergeometric model for enrichment testing.
  "
)
any_goseq_t2only <- plot_network_t2only <- FALSE
pwf_t2only <- nullp(
  vec_t2only,
  bias.data = log10(setNames(all_ids$w, all_ids$gene_id)[mapped_ids]), 
  plot.fit = TRUE
) 
if (sum(pwf_t2only$DEgenes) > 0) {
  goseq_t2only_res <- pwf_t2only %>% 
    goseq(gene2cat = gs_by_geneid, method = gs_method) %>% 
    dplyr::filter(numDEInCat > 0) %>% 
    as_tibble() %>% 
    dplyr::select(
      gs_name = category, pval = over_represented_pvalue, starts_with("num")
    ) %>% 
    mutate(
      adj_p = p.adjust(pval, adj_method),
      enriched = adj_p < enrich_alpha
    ) %>% 
    dplyr::filter(numDEInCat >= min_sig) %>% 
    left_join(
      dplyr::filter(msigdb, gene_id %in% mapped_ids)[c("gs_name", "gene_name", "gs_url", "gs_description")],
      by = "gs_name"
    ) %>% 
    chop(gene_name) %>% 
    mutate(gene_name = vapply(gene_name, paste, character(1), collapse = "; "))
}
any_goseq_t2only <- sum(goseq_t2only_res$adj_p < enrich_alpha) > 0
tg_t2only <- make_tbl_graph(
  goseq_t2only_res, 
  gs = gs_by_gsid %>% 
    lapply(intersect, rownames(subset(pwf_t2only, DEgenes))) %>% 
    .[vapply(., length, integer(1)) > 0]
)
plot_network_t2only <- length(tg_t2only) >= min_network_size
if (!any_goseq_t2only) para <- glue(para, " No enrichment was found.")
tbl <- goseq_t2only_res %>%
  dplyr::filter(enriched) %>% 
  dplyr::select(
    gs_name, gs_description, pval, adj_p, starts_with("num"), gene_name
  ) %>% 
  reactable(
    filterable = TRUE,
    showPageSizeOptions = TRUE,
    columns = list2(
      gs_name = colDef(
        name = "Gene Set",
        cell = function(value) htmltools::tags$a(
          href = gs_url[[value]], 
          target = "_blank", 
          str_replace_all(value, "_", " ")
        ),
        html = TRUE,
        maxWidth = 150
      ),
      gs_description = colDef(
        name = "Description",
        cell = function(value) str_trunc(value, width = 150)
      ),
      pval = colDef(
        name = "P-Value",
        cell = function(value) ifelse(
          value < 0.001,
          sprintf("%.2e", value),
          sprintf("%.3f", value)
        ),
        maxWidth = 80
      ),
      adj_p = colDef(
        name = "p<sub>adj</sub>", html = TRUE,
        cell = function(value) ifelse(
          value < 0.001,
          sprintf("%.2e", value),
          sprintf("%.3f", value)
        ),
        maxWidth = 80
      ),
      numDEInCat = colDef(
        name = glue("Genes Mapped to {comps[[2]]} Only"),
        cell = function(value) comma(value, 1),
        maxWidth = 80
      ),
      numInCat = colDef(
        name = "Genes Mapped to Both",
        cell = function(value) comma(value, 1),
        maxWidth = 80
      ),
      gene_name = colDef(
        name = "Gene Names",
        cell = function(value) with_tooltip(value, width = 150)
      )
    )
  )
div(class = "table",
    div(class = "table-header", div(class = "caption", para)),
    tbl
)
All 16,016 genes with a mapped binding window to only H3K27ac were compared to the 26,098 genes mapped to either AR or H3K27ac, in order to ascertain whether any gene-sets were likely to be targeted exclusively. No sampling bias offset was incorporated, instead using a simple Hypergeometric model for enrichment testing.

Network Plots

All Mapped Genes

tg_mapped %>% 
  ggraph(layout = net_layout, weights = oc^2) +
  geom_edge_link(aes(width = oc^2, alpha = oc^2)) +
  geom_node_point(
    aes(fill = -log10(pval), size = numDEInCat),
    shape = 21
  ) +
  geom_node_text(
    aes(label = label),
    colour = "black", size = 3, 
    data = . %>%
      mutate(
        label = str_replace_all(label, "_", " ") %>% str_trunc(60) %>% str_wrap(width = 18)
      ),
    repel = TRUE, max.overlaps = max(10, round(length(tg_mapped) / 4, 0)),
    bg.color = "white", bg.r = 0.1, 
  ) +
  scale_x_continuous(expand = expansion(c(0.1, 0.1))) +
  scale_y_continuous(expand = expansion(c(0.1, 0.1))) +
  scale_fill_viridis_c(option = "inferno", begin = 0.25) +
  scale_size_continuous(range = c(1, 10)) +
  scale_edge_width(range = c(1, 6), limits = c(0, 1)) +
  scale_edge_alpha(range = c(0.1, 0.4), limits = c(0, 1)) +
  guides(edge_alpha= "none") +
  labs(size = "Mapped Genes", edge_width = expr(paste(OC^2))) +
  theme_void() 
*Network plot showing  gene-sets enriched amongst the overall set of sites with a binding site from __either__ comparison.*

Network plot showing gene-sets enriched amongst the overall set of sites with a binding site from either comparison.

Genes Mapped to Both AR and H3K27ac

tg_t1t2 %>% 
  ggraph(layout = net_layout, weights = oc^2) +
  geom_edge_link(aes(width = oc^2, alpha = oc^2)) +
  geom_node_point(
    aes(fill = -log10(pval), size = numDEInCat),
    shape = 21
  ) +
  geom_node_text(
    aes(label = label),
    colour = "black", size = 3, 
    data = . %>%
      mutate(
        label = str_replace_all(label, "_", " ") %>% str_trunc(60) %>% str_wrap(width = 18)
      ),
    repel = TRUE, max.overlaps = max(10, round(length(tg_t1t2) / 4, 0)),
    bg.color = "white", bg.r = 0.1, 
  ) +
  scale_x_continuous(expand = expansion(c(0.1, 0.1))) +
  scale_y_continuous(expand = expansion(c(0.1, 0.1))) +
  scale_fill_viridis_c(option = "inferno", begin = 0.25) +
  scale_size_continuous(range = c(1, 10)) +
  scale_edge_width(range = c(1, 6), limits = c(0, 1)) +
  scale_edge_alpha(range = c(0.1, 0.4), limits = c(0, 1)) +
  guides(edge_alpha= "none") +
  labs(size = "Mapped Genes", edge_width = expr(paste(OC^2))) +
  theme_void() 
*Network plot showing gene-sets enriched amongst the overall set of sites with a binding site for __both__ AR and H3K27ac.*

Network plot showing gene-sets enriched amongst the overall set of sites with a binding site for both AR and H3K27ac.

Genes Mapped to AR Only

tg_t1only %>% 
  ggraph(layout = net_layout, weights = oc^2) +
  geom_edge_link(aes(width = oc^2, alpha = oc^2)) +
  geom_node_point(
    aes(fill = -log10(pval), size = numDEInCat),
    shape = 21
  ) +
  geom_node_text(
    aes(label = label),
    colour = "black", size = 3, 
    data = . %>%
      mutate(
        label = str_replace_all(label, "_", " ") %>% str_trunc(60) %>% str_wrap(width = 18)
      ),
    repel = TRUE, max.overlaps = max(10, round(length(tg_t1only) / 4, 0)),
    bg.color = "white", bg.r = 0.1, 
  ) +
  scale_x_continuous(expand = expansion(c(0.1, 0.1))) +
  scale_y_continuous(expand = expansion(c(0.1, 0.1))) +
  scale_fill_viridis_c(option = "inferno", begin = 0.25) +
  scale_size_continuous(range = c(1, 10)) +
  scale_edge_width(range = c(1, 6), limits = c(0, 1)) +
  scale_edge_alpha(range = c(0.1, 0.4), limits = c(0, 1)) +
  guides(edge_alpha= "none") +
  labs(size = "Mapped Genes", edge_width = expr(paste(OC^2))) +
  theme_void() 
*Network plot showing all gene-sets enriched amongst the overall set of sites with a binding site for __AR only__.*

Network plot showing all gene-sets enriched amongst the overall set of sites with a binding site for AR only.

Genes Mapped to H3K27ac Only

tg_t2only %>% 
  ggraph(layout = net_layout, weights = oc^2) +
  geom_edge_link(aes(width = oc^2, alpha = oc^2)) +
  geom_node_point(
    aes(fill = -log10(pval), size = numDEInCat),
    shape = 21
  ) +
  geom_node_text(
    aes(label = label),
    colour = "black", size = 3, 
    data = . %>%
      mutate(
        label = str_replace_all(label, "_", " ") %>% str_trunc(60) %>% str_wrap(width = 18)
      ),
    repel = TRUE, max.overlaps = max(10, round(length(tg_t2only) / 4, 0)),
    bg.color = "white", bg.r = 0.1, 
  ) +
  scale_x_continuous(expand = expansion(c(0.1, 0.1))) +
  scale_y_continuous(expand = expansion(c(0.1, 0.1))) +
  scale_fill_viridis_c(option = "inferno", begin = 0.25) +
  scale_size_continuous(range = c(1, 10)) +
  scale_edge_width(range = c(1, 6), limits = c(0, 1)) +
  scale_edge_alpha(range = c(0.1, 0.4), limits = c(0, 1)) +
  guides(edge_alpha= "none") +
  labs(size = "Mapped Genes", edge_width = expr(paste(OC^2))) +
  theme_void() 
*Network plot showing all gene-sets enriched amongst the overall set of sites with a binding site for __H3K27ac only__.*

Network plot showing all gene-sets enriched amongst the overall set of sites with a binding site for H3K27ac only.

Comparison of Differentially Bound Windows

n_peaks <- structure(all_ids$n_windows, names = all_ids$gene_id)[mapped_ids]
goseq_group_res <- group_ids %>% 
  .[str_detect(names(.), "Up|Down")] %>% 
  mclapply(
    function(x) {
      vec <- structure(mapped_ids %in% x, names = mapped_ids)
      pwf <- nullp(vec, bias.data = n_peaks, plot.fit = FALSE)
      res <- tibble(adj_p = numeric())
      gs_method <- "Hypergeometric"
      if (sum(pwf$DEgenes) > 0) {
        res <- goseq(pwf, gene2cat = gs_by_geneid, method = gs_method) %>% 
          as_tibble() %>% 
          dplyr::filter(numDEInCat > 0) %>% 
          mutate(adj_p = p.adjust(over_represented_pvalue, adj_method)) %>% 
          dplyr::filter(numDEInCat >= min_sig) %>% 
          dplyr::select(
            gs_name = category, pval = over_represented_pvalue, adj_p, 
            ends_with("InCat")
          ) %>% 
          left_join(
            dplyr::filter(msigdb, gene_id %in% x)[c("gs_name", "gene_name", "gs_url", "gs_description")],
            by = "gs_name"
          ) %>% 
          chop(gene_name) %>% 
          mutate(
            gene_name = vapply(gene_name, paste, character(1), collapse = "; ")
          )
      }
      res
    },
    mc.cores = threads
  ) %>% 
  ## This step at the end basically removes any groups with no results
  lapply(list) %>% 
  as_tibble() %>% 
  pivot_longer(everything(), names_to = "group") %>% 
  unnest(everything()) %>% 
  split(.$group)
goseq_group_sig <- goseq_group_res %>% 
  lapply(dplyr::filter, adj_p < enrich_alpha) %>% 
  .[vapply(., nrow, integer(1)) > 0]
tg_group_res <- names(goseq_group_sig) %>% 
  lapply(
    function(x) {
      make_tbl_graph(
        goseq_group_sig[[x]],
        gs_by_gsid[goseq_group_sig[[x]]$gs_name] %>%
          lapply(intersect, group_ids[[x]])
      )
    }
  ) %>% 
  setNames(names(goseq_group_sig)) %>% 
  .[vapply(., length, integer(1)) >= min_network_size]

The same MSigDB gene sets were used for associating any pathways with the combined changes in both AR and H3K27ac. The ‘universe’ of genes was defined as all 26,098 gene ids with a binding window mapped from either target. No sampling bias was included, with test results using a simple Hypergeometric Distribution.

Results

htmltools::tagList(
  names(goseq_group_sig) %>% 
    mclapply(
      function(x) {
        cp <- glue(
          "
          All {comma(length(group_ids[[x]]), 1)} genes mapped to binding windows
           with the pattern {x} were compared to the complete set of 
          {comma(length(mapped_ids), 1)} genes mapped to at least one binding
          window. {nrow(goseq_group_sig[[x]])} gene-sets were considered as 
          enriched using a threshold of {enrich_alpha} for adjusted-p-values.
          "
        )
        htmltools::div(
          htmltools::div(
            id = x %>% 
              str_to_lower %>% 
              str_replace_all(" ", "-") %>% 
              str_replace_all("-+", "-"),
            class="section level4",
            htmltools::h4(class = "tabset", x),
            htmltools::tags$em(cp),
            goseq_group_sig[[x]] %>% 
              separate(group, into = comps, sep = " - ") %>% 
              dplyr::select(
                all_of(comps), gs_name, gs_description, pval, adj_p, everything()
              ) %>% 
              reactable(
                filterable = TRUE,
                showPageSizeOptions = TRUE,
                columns = list2(
                  "{comps[[1]]}" := colDef(
                    cell = function(value) {
                      value <- str_extract(value, "Up|Down|Unchanged|Undetected")
                      if (str_detect(value, "Up")) value <- paste(value, "\u21E7")
                      if (str_detect(value, "Down")) value <- paste(value, "\u21E9")
                      paste(targets[[1]], value)
                    },
                    style = function(value) {
                      colour <- case_when(
                        str_detect(value, "Up") ~ colours$direction[["up"]],
                        str_detect(value, "Down") ~ colours$direction[["down"]],
                        str_detect(value, "Unchanged") ~ colours$direction[["unchanged"]],
                        str_detect(value, "Undetected") ~ colours$direction[["undetected"]]
                      )
                      list(color = colour)
                    },
                    maxWidth = 120
                  ),
                  "{comps[[2]]}" := colDef(
                    cell = function(value) {
                      value <- str_extract(value, "Up|Down|Unchanged|Undetected")
                      if (str_detect(value, "Up")) value <- paste(value, "\u21E7")
                      if (str_detect(value, "Down")) value <- paste(value, "\u21E9")
                      paste(targets[[2]], value)
                    },
                    style = function(value) {
                      colour <- case_when(
                        str_detect(value, "Up") ~ colours$direction[["up"]],
                        str_detect(value, "Down") ~ colours$direction[["down"]],
                        str_detect(value, "Unchanged") ~ colours$direction[["unchanged"]],
                        str_detect(value, "Undetected") ~ colours$direction[["undetected"]]
                      )
                      list(color = colour)
                    },
                    maxWidth = 120
                  ),
                  gs_name = colDef(
                    name = "Gene Set",
                    cell = function(value) {
                      htmltools::tags$a(
                        href = gs_url[[value]], target = "_blank",
                        str_replace_all(value, "_", " ")
                      )
                    },
                    html = TRUE,
                    maxWidth = 150
                  ),
                  gs_description = colDef(
                    name = "Description",
                    cell = function(value) str_trunc(value, width = 150)
                  ),
                  gs_url = colDef(show = FALSE),
                  pval = colDef(
                    name = "P-Value",
                    cell = function(value) ifelse(
                      value < 0.001,
                      sprintf("%.2e", value),
                      sprintf("%.3f", value)
                    ),
                    maxWidth = 80
                  ),
                  adj_p = colDef(
                    name = "p<sub>adj</sub>", html = TRUE,
                    cell = function(value) ifelse(
                      value < 0.001,
                      sprintf("%.2e", value),
                      sprintf("%.3f", value)
                    ),
                    maxWidth = 80
                  ),
                  numDEInCat = colDef(
                    name = "Mapped Genes",
                    cell = function(value) comma(value, 1),
                    maxWidth = 80
                  ),
                  numInCat = colDef(
                    name = "Genes in Gene-Set",
                    cell = function(value) comma(value, 1),
                    maxWidth = 80
                  ),
                  gene_name = colDef(
                    name = "Gene Names",
                    cell = function(value) with_tooltip(value, width = 150)
                  )
                )
              )
          )
        )
      },
      mc.cores = threads
    )
)

AR Up - H3K27ac Down

All 13 genes mapped to binding windows with the pattern AR Up - H3K27ac Down were compared to the complete set of 26,098 genes mapped to at least one binding window. 2 gene-sets were considered as enriched using a threshold of 0.05 for adjusted-p-values.

AR Up - H3K27ac Unchanged

All 3,354 genes mapped to binding windows with the pattern AR Up - H3K27ac Unchanged were compared to the complete set of 26,098 genes mapped to at least one binding window. 37 gene-sets were considered as enriched using a threshold of 0.05 for adjusted-p-values.

AR Up - H3K27ac Undetected

All 5,207 genes mapped to binding windows with the pattern AR Up - H3K27ac Undetected were compared to the complete set of 26,098 genes mapped to at least one binding window. 86 gene-sets were considered as enriched using a threshold of 0.05 for adjusted-p-values.

AR Up - H3K27ac Up

All 445 genes mapped to binding windows with the pattern AR Up - H3K27ac Up were compared to the complete set of 26,098 genes mapped to at least one binding window. 1 gene-sets were considered as enriched using a threshold of 0.05 for adjusted-p-values.

All Groups

goseq_group_sig %>%  
  bind_rows() %>% 
  arrange(pval) %>% 
  left_join(n_mapped, by = c("group" = "status")) %>% 
  mutate(
    gs_name = fct_inorder(gs_name) %>% 
      fct_relabel(str_replace_all, "_", " ") %>% 
      fct_relabel(str_trunc, width = 60),
    prop = numDEInCat / mapped,
    group = factor(group, levels = n_mapped$status)
  ) %>%
  droplevels() %>% 
  ggplot(aes(group, fct_rev(gs_name))) + 
  geom_point(aes(size = prop, fill = -log10(pval)), shape = 21) +
  geom_ysidecol(
    aes(x = numInCat), data = . %>% distinct(gs_name, numInCat),
  ) +
  geom_xsidecol(
    aes(y = mapped),
    data = n_mapped %>% 
      dplyr::filter(status %in% names(goseq_group_sig)) %>% 
      dplyr::rename(group = status),
    width = 0.5
  ) +
  scale_x_discrete(labels = label_wrap(10)) +
  scale_ysidex_continuous(expand = expansion(c(0, 0.15))) +
  scale_xsidey_continuous(
    expand = expansion(c(0, 0.15))
  ) +
  scale_fill_viridis_c(option = "inferno", begin = 0.2) +
  scale_size(range = c(0, 5), labels = percent) + 
  labs(
    x = c(), y = c(), size = "% Mapped\nGenes", fill = expr(paste(-log[10], "p"))
  ) +
  theme(
    panel.grid = element_blank(), 
    axis.text = element_text(size = 8),
    legend.position = "right",
    ggside.panel.scale.x = 0.2,
    ggside.panel.scale.y = 0.3,
    ggside.axis.text.x.bottom = element_text(angle = 270, hjust = 0, vjust = 0.5)
  ) 
*Combined enrichment across all groups, showing only significant results for enrichment. The top panel shows how many genes were mapped to sites for each group, whilst the right panel shows gene set size. Point sizes indicate the proportion of mapped genes which are from each pathway.*

Combined enrichment across all groups, showing only significant results for enrichment. The top panel shows how many genes were mapped to sites for each group, whilst the right panel shows gene set size. Point sizes indicate the proportion of mapped genes which are from each pathway.

Network Plots

htmltools::tagList(
  names(tg_group_res) %>% 
    mclapply(
      function(x) {
        ## Export the image
        img_out <- file.path(
          fig_path,
          x %>% 
            str_replace_all(" ", "_") %>% 
            str_replace_all("_-_", "-") %>% 
            paste0("_network.", fig_type)
        )
        fig_fun(
          filename = img_out,
          width = knitr::opts_current$get("fig.width"),
          height = knitr::opts_current$get("fig.height")
        )
        p <- tg_group_res[[x]] %>%
          ggraph(layout = net_layout, weights =oc^2) +
          geom_edge_link(aes(width = oc^2, alpha = oc^2)) +
          geom_node_point(
            aes(fill = -log10(pval), size = numDEInCat),
            shape = 21
          ) +
          geom_node_text(
            aes(label = label),
            colour = "black", size = 3, 
            data = . %>%
              mutate(
                label = str_replace_all(label, "_", " ") %>% str_trunc(60) %>% str_wrap(width = 18)
              ),
            repel = TRUE,
            max.overlaps = max(10, round(length(tg_group_res[[x]]) / 4, 0)),
            bg.color = "white", bg.r = 0.1
          ) +
          scale_x_continuous(expand = expansion(c(0.1, 0.1))) +
          scale_y_continuous(expand = expansion(c(0.1, 0.1))) +
          scale_fill_viridis_c(option = "inferno", begin = 0.25) +
          scale_size_continuous(range = c(1, 10)) +
          scale_edge_width(range = c(1, 6), limits = c(0, 1)) +
          scale_edge_alpha(range = c(0.1, 0.4), limits = c(0, 1)) +
          guides(edge_alpha= "none") +
          labs(size = "Mapped Genes", edge_width = expr(paste(OC^2))) +
          theme_void()
        print(p)
        dev.off()
        ## Create html tags
        fig_link <- str_extract(img_out, "assets.+")
        cp <- htmltools::tags$em(
          glue(
            "
            Network plot showing enriched pathways mapped to genes 
            associated with {x}. 
            "
          )
        )
        htmltools::div(
          htmltools::div(
            id = img_out %>%
              basename() %>%
              str_remove_all(glue(".{fig_type}$")) %>%
              str_to_lower() %>%
              str_replace_all("_", "-"),
            class="section level4",
            htmltools::h4(x),
            htmltools::div(
              class = "figure", style = "text-align: center",
              htmltools::img(src = fig_link, width = "100%"),
              htmltools::tags$caption(cp)
            )
          )
        )
      },
      mc.cores = threads
    )
)

AR Up - H3K27ac Unchanged

Network plot showing enriched pathways mapped to genes associated with AR Up - H3K27ac Unchanged.

AR Up - H3K27ac Undetected

Network plot showing enriched pathways mapped to genes associated with AR Up - H3K27ac Undetected.

Data Export

write_rds(all_windows, all_out$results, compress = "gz")
save.image(all_out$renv)
all_windows %>% 
  as_tibble() %>% 
  dplyr::select(-detected) %>% 
  unnest(all_of("gene_id")) %>% # This ensures correct id2gene mappings
  mutate(gene_name = id2gene[gene_id]) %>% 
  dplyr::select(
    starts_with("gene"), range, any_of(c("region", "feature", "hic")),
  ) %>% 
  write_csv(
    gzfile(all_out$csv)
  )
write_csv(de_genes_both_comps, all_out$de_genes)
goseq_group_sig %>% 
  bind_rows() %>% 
  write_csv(all_out$goseq)
cmn_res %>% 
  bind_rows() %>% 
  mutate(
    leadingEdge = vapply(leadingEdge, paste, character(1), collapse = "; "),
    gs_url = gs_url[gs_name]
  ) %>% 
  write_csv(all_out$rna_enrichment)

During this workflow, the following files were exported:

  • csv: output/pairwise_comparisons/AR_H3K27ac/AR_E2_E2DHT-H3K27ac_E2_E2DHT-pairwise_comparison.csv.gz
  • de_genes: output/pairwise_comparisons/AR_H3K27ac/AR_E2_E2DHT-H3K27ac_E2_E2DHT-de_genes.csv
  • goseq: output/pairwise_comparisons/AR_H3K27ac/AR_E2_E2DHT-H3K27ac_E2_E2DHT-enrichment.csv
  • results: output/pairwise_comparisons/AR_H3K27ac/AR_E2_E2DHT-H3K27ac_E2_E2DHT-all_windows.rds
  • rna_enrichment: output/pairwise_comparisons/AR_H3K27ac/AR_E2_E2DHT-H3K27ac_E2_E2DHT-rnaseq_enrichment.csv
  • renv: output/envs/AR_E2_E2DHT-H3K27ac_E2_E2DHT-pairwise_comparison.RData

References

Subramanian, Aravind, Pablo Tamayo, Vamsi K. Mootha, Sayan Mukherjee, Benjamin L. Ebert, Michael A. Gillette, Amanda Paulovich, et al. 2005. “Gene Set Enrichment Analysis: A Knowledge-Based Approach for Interpreting Genome-Wide Expression Profiles.” Proceedings of the National Academy of Sciences 102 (43): 15545–50. https://doi.org/10.1073/pnas.0506580102.
Young, M. D., M. J. Wakefield, G. K. Smyth, and A. Oshlack. 2010. Gene ontology analysis for RNA-seq: accounting for selection bias.” Genome Biol 11 (2): R14.


R version 4.2.2 (2022-10-31)

Platform: x86_64-conda-linux-gnu (64-bit)

locale: LC_CTYPE=en_AU.UTF-8, LC_NUMERIC=C, LC_TIME=en_AU.UTF-8, LC_COLLATE=en_AU.UTF-8, LC_MONETARY=en_AU.UTF-8, LC_MESSAGES=en_AU.UTF-8, LC_PAPER=en_AU.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_AU.UTF-8 and LC_IDENTIFICATION=C

attached base packages: parallel, grid, stats4, stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: tidygraph(v.1.2.3), ggraph(v.2.1.0), extraChIPs(v.1.2.4), BiocParallel(v.1.32.5), GenomicInteractions(v.1.32.0), InteractionSet(v.1.26.0), SummarizedExperiment(v.1.28.0), Biobase(v.2.58.0), MatrixGenerics(v.1.10.0), matrixStats(v.1.0.0), goseq(v.1.50.0), geneLenDataBase(v.1.34.0), BiasedUrn(v.2.0.10), htmltools(v.0.5.5), msigdbr(v.7.5.1), ggside(v.0.2.2), rlang(v.1.1.1), ggrepel(v.0.9.3), scales(v.1.2.1), magrittr(v.2.0.3), UpSetR(v.1.4.0), VennDiagram(v.1.7.3), futile.logger(v.1.4.3), rtracklayer(v.1.58.0), plyranges(v.1.18.0), GenomicRanges(v.1.50.0), GenomeInfoDb(v.1.34.9), IRanges(v.2.32.0), S4Vectors(v.0.36.0), BiocGenerics(v.0.44.0), reactable(v.0.4.4), yaml(v.2.3.7), pander(v.0.6.5), glue(v.1.6.2), lubridate(v.1.9.2), forcats(v.1.0.0), stringr(v.1.5.0), dplyr(v.1.1.2), purrr(v.1.0.1), readr(v.2.1.4), tidyr(v.1.3.0), tibble(v.3.2.1), ggplot2(v.3.4.2) and tidyverse(v.2.0.0)

loaded via a namespace (and not attached): utf8(v.1.2.3), tidyselect(v.1.2.0), RSQLite(v.2.3.1), AnnotationDbi(v.1.60.0), htmlwidgets(v.1.6.2), munsell(v.0.5.0), codetools(v.0.2-19), interp(v.1.1-4), withr(v.2.5.0), colorspace(v.2.1-0), filelock(v.1.0.2), highr(v.0.10), knitr(v.1.43), rstudioapi(v.0.14), labeling(v.0.4.2), GenomeInfoDbData(v.1.2.9), polyclip(v.1.10-4), farver(v.2.1.1), bit64(v.4.0.5), rprojroot(v.2.0.3), vctrs(v.0.6.3), generics(v.0.1.3), lambda.r(v.1.2.4), xfun(v.0.39), csaw(v.1.32.0), biovizBase(v.1.46.0), timechange(v.0.2.0), BiocFileCache(v.2.6.0), R6(v.2.5.1), doParallel(v.1.0.17), graphlayouts(v.1.0.0), clue(v.0.3-64), locfit(v.1.5-9.8), AnnotationFilter(v.1.22.0), bitops(v.1.0-7), cachem(v.1.0.8), DelayedArray(v.0.24.0), vroom(v.1.6.3), BiocIO(v.1.8.0), nnet(v.7.3-19), gtable(v.0.3.3), ensembldb(v.2.22.0), GlobalOptions(v.0.1.2), splines(v.4.2.2), lazyeval(v.0.2.2), dichromat(v.2.0-0.1), broom(v.1.0.5), checkmate(v.2.2.0), crosstalk(v.1.2.0), GenomicFeatures(v.1.50.2), backports(v.1.4.1), Hmisc(v.5.1-0), EnrichedHeatmap(v.1.27.2), tools(v.4.2.2), ellipsis(v.0.3.2), jquerylib(v.0.1.4), RColorBrewer(v.1.1-3), Rcpp(v.1.0.10), plyr(v.1.8.8), base64enc(v.0.1-3), progress(v.1.2.2), zlibbioc(v.1.44.0), RCurl(v.1.98-1.12), prettyunits(v.1.1.1), rpart(v.4.1.19), deldir(v.1.0-9), viridis(v.0.6.3), GetoptLong(v.1.0.5), reactR(v.0.4.4), cluster(v.2.1.4), here(v.1.0.1), ComplexUpset(v.1.3.3), data.table(v.1.14.8), futile.options(v.1.0.1), circlize(v.0.4.15), ProtGenerics(v.1.30.0), hms(v.1.1.3), patchwork(v.1.1.2), evaluate(v.0.21), XML(v.3.99-0.14), jpeg(v.0.1-10), shape(v.1.4.6), gridExtra(v.2.3), compiler(v.4.2.2), biomaRt(v.2.54.0), crayon(v.1.5.2), mgcv(v.1.8-42), tzdb(v.0.4.0), Formula(v.1.2-5), DBI(v.1.1.3), tweenr(v.2.0.2), formatR(v.1.14), dbplyr(v.2.3.2), ComplexHeatmap(v.2.14.0), MASS(v.7.3-60), rappdirs(v.0.3.3), babelgene(v.22.9), Matrix(v.1.5-4.1), cli(v.3.6.1), metapod(v.1.6.0), Gviz(v.1.42.0), igraph(v.1.4.2), pkgconfig(v.2.0.3), GenomicAlignments(v.1.34.0), foreign(v.0.8-84), xml2(v.1.3.3), foreach(v.1.5.2), bslib(v.0.5.0), XVector(v.0.38.0), VariantAnnotation(v.1.44.0), digest(v.0.6.31), Biostrings(v.2.66.0), rmarkdown(v.2.22), htmlTable(v.2.4.1), edgeR(v.3.40.0), restfulr(v.0.0.15), curl(v.5.0.1), Rsamtools(v.2.14.0), rjson(v.0.2.21), lifecycle(v.1.0.3), nlme(v.3.1-162), jsonlite(v.1.8.5), viridisLite(v.0.4.2), limma(v.3.54.0), BSgenome(v.1.66.3), fansi(v.1.0.4), pillar(v.1.9.0), lattice(v.0.21-8), KEGGREST(v.1.38.0), fastmap(v.1.1.1), httr(v.1.4.6), GO.db(v.3.16.0), png(v.0.1-8), iterators(v.1.0.14), bit(v.4.0.5), ggforce(v.0.4.1), stringi(v.1.7.12), sass(v.0.4.6), blob(v.1.2.4), latticeExtra(v.0.6-30) and memoise(v.2.0.1)